
Commit 6aa0178

Version: 0.10.5-alpha.2
Unify BPE/Char tokenizers, add tokenize CLI, modularize

- Refactor: Unified BPE and Char tokenizer/vocabulary modules with extensible config
- Feature: Add tokenize CLI tool for training, encoding, and decoding text
- Infra: Modularize codebase, update CMake and tests for new structure
- Test: Add comprehensive unit tests for BPE/Char tokenizers and trainers

BREAKING CHANGE: Legacy Gpt2/Llama3 tokenizer modules removed; all code and tools now use unified BPE/Char modules and new file locations. Update any scripts or integrations to use the new tokenize CLI and module APIs.
1 parent 55a64b5 commit 6aa0178

40 files changed

Lines changed: 2869 additions & 499 deletions

.github/copilot-instructions.md

Lines changed: 35 additions & 38 deletions
@@ -1,67 +1,64 @@
 # Mila — Copilot Instructions
 
-## Code generation policy
+## General Guidelines
 - Generate code only when explicitly requested (e.g., "implement", "update", "write code", "generate", "create code"). Otherwise provide analysis, design guidance, and minimal examples.
-- Mila is at the Alpha stage of development, please do not consider backward compatibility with previous versions when generating code.
+- Mila is at the Alpha stage of development; please do not consider backward compatibility with previous versions when generating code.
 
-## Doxygen / file header policy
-- File-level Doxygen comments must be concise summaries (one to three short sentences).
-  - Purpose: provide a quick summary of the file intent for readers and tools.
-  - Must NOT repeat detailed API, implementation notes, or usage examples.
-- Detailed documentation belongs in the module/class/function-level Doxygen comments (module API).
-  - Put behavior, parameters, return semantics, ownership/lifetime, threading assumptions, and examples on the relevant symbol's Doxygen block.
-- Module-level comments (module partitions) should describe the public API surface and usage patterns.
-- Example file-level header (preferred):
-  - Brief one-line summary: "Configuration for the Residual module."
-  - Optional short second sentence for scope: "Provides fluent setters used by Residual and backend factories."
-- Rationale: keeps files scannable and avoids duplicated, stale documentation across many files.
-
-## Coding Style
-
-### Blank Lines Around Blocks
-- Add blank line before control flow blocks (if, for, while, switch)
-- Add blank line after closing brace of blocks
-- Exception: No blank line between `} else {` or `} catch {`
-
-### Blank Lines Around Return Statements
-- Add blank line before `return` statement (final return in function)
-- Exception: Early returns (guard clauses) don't need blank line
-- Exception: Single-statement functions don't need blank line
+## Code Style
+- Do not columnize/align code with extra spaces. Identifiers and types should use standard single-space formatting. Column alignment breaks when names change.
+- Add blank line before control flow blocks (if, for, while, switch).
+- Add blank line after closing brace of blocks.
+- Exception: No blank line between `} else {` or `} catch {`.
+- Add blank line before `return` statement (final return in function).
+- Exception: Early returns (guard clauses) don't need blank line.
+- Exception: Single-statement functions don't need blank line.
 
 ## High-level constraints
 - Project is alpha: breaking changes and simplifications are acceptable.
 - Backward compatibility is NOT required. Do not use Deprecated APIs.
-- Do not use Mila deprecated API
+- Do not use Mila deprecated API.
 - Host code: C++23 using modules and module partitions. Tests: GTest. Build: CMake + Ninja.
 
-## Comment policy
+## Comment Policy
 - NEVER generate trivial comments that simply restate what the code does. For example, do not generate comments like:
   - `// increment i` for the line `i++;`
   Such trivial, repetitive comments must not be produced by Copilot.
-- Use only ASCII characters (no Unicode checkmarks, emojis, or special symbols)
-- Don't add simple validation comments (e.g., "Good", "Correct", "OK", "Bad")
-- Comments should explain WHAT the code's intent or contract is, or WHY a non-obvious approach is required, not restate HOW the code performs obvious operations.
-  - Good: `// accumulate running mean across batch to avoid a second pass`
-  - Good: `// Use integer index to preserve pointer stability required by the SIMD kernel`
+- Use only ASCII characters (no Unicode checkmarks, emojis, or special symbols).
+- Don't add simple validation comments (e.g., "Good", "Correct", "OK", "Bad").
+- Comments should explain WHAT the code's intent or contract is, or WHY a non-obvious approach is required, not restate HOW the code performs obvious operations.
+  - Good: `// accumulate running mean across batch to avoid a second pass`.
+  - Good: `// Use integer index to preserve pointer stability required by the SIMD kernel`.
 - Prefer documenting:
   - Function/module contract: inputs, outputs, side-effects, threading assumptions, and performance/precision trade-offs.
   - Non-obvious algorithms, invariants, and corner cases that callers or maintainers must preserve.
   - API expectations: ownership, lifetime, and accumulation semantics (overwrite vs accumulate).
 - Keep comments technical and informative, not evaluative or apologetic.
 - Do not include reasoning or justification for design decisions in code comments (keep rationale in design documents or commit messages).
 - Avoid commenting trivial lines of code that are self-explanatory; prefer a brief block comment describing the overall purpose of the surrounding code instead.
-- Documentation comments (Doxygen) should describe behavior, usage, public contracts and examples — not explain why changes were made.
+- Documentation comments (Doxygen) should describe behavior, usage, public contracts, and examples — not explain why changes were made.
+
+## Doxygen / File Header Policy
+- File-level Doxygen comments must be concise summaries (one to three short sentences).
+  - Purpose: provide a quick summary of the file intent for readers and tools.
+  - Must NOT repeat detailed API, implementation notes, or usage examples.
+- Detailed documentation belongs in the module/class/function-level Doxygen comments (module API).
+  - Put behavior, parameters, return semantics, ownership/lifetime, threading assumptions, and examples on the relevant symbol's Doxygen block.
+- Module-level comments (module partitions) should describe the public API surface and usage patterns.
+- Example file-level header (preferred):
+  - Brief one-line summary: "Configuration for the Residual module."
+  - Optional short second sentence for scope: "Provides fluent setters used by Residual and backend factories."
+- Rationale: keeps files scannable and avoids duplicated, stale documentation across many files.
 
-## Doxygen guidance for generated code
+## Doxygen Guidance for Generated Code
 - When emitting Doxygen for symbols:
   - Use the full signature and describe preconditions, postconditions, and side-effects.
   - Prefer param/return tags for public methods.
   - Use short examples only in the symbol comment (not in file headers).
 - Avoid emitting long prose in file headers; put detail in the API-level documentation.
 
-## Notes for AI assistant
+## Notes for AI Assistant
 - When recommending code, prefer modern C++ idioms (RAII, smart pointers, STL algorithms).
 - Always include testing suggestions and consider CPU/CUDA parity.
-- In explanatory text (not code), you may use formatting symbols for clarity, but generated code comments must follow the comment policy above
-- Keep commit messages and explanatory responses separate from code documentation
-- Unit tests are structured by project, namespace and class; place tests under the Tests tree following the repository project layout and mirror the production namespace/class organization.
+- In explanatory text (not code), you may use formatting symbols for clarity, but generated code comments must follow the comment policy above.
+- Keep commit messages and explanatory responses separate from code documentation.
+- Unit tests are structured by project, namespace, and class; place tests under the Tests tree following the repository project layout and mirror the production namespace/class organization.

Mila/CMakeLists.txt

Lines changed: 24 additions & 20 deletions
@@ -289,9 +289,9 @@ PUBLIC
 #----------------------------------------------------------------------
 # Dnn / Data
 #----------------------------------------------------------------------
-"Src/Dnn/Data/DataLoader.ixx"
-"Src/Dnn/Data/TokenSequenceLoader.ixx"
-"Src/Dnn/Data/TokenSequenceLoader.Config.ixx"
+"Src/Data/Loaders/DataLoader.ixx"
+"Src/Data/Loaders/TokenSequenceLoader.ixx"
+"Src/Data/Loaders/TokenSequenceLoader.Config.ixx"
 
 #---------------------------------------------------------------
 # Dnn / Serialization
@@ -381,31 +381,36 @@ PUBLIC
 
 "Src/Dnn/Components/Transformers/LlaMa/Llama.Presets.ixx"
 
-"Src/Dnn/Data/Tokenizer.ixx"
-"Src/Dnn/Data/Gpt2Tokenizer.ixx"
-"Src/Dnn/Data/Llama3Tokenizer.ixx"
-"Src/Dnn/Data/TokenizerVocabulary.ixx"
-"Src/Dnn/Data/TokenizerType.ixx"
+"Src/Data/Tokenizers/Tokenizer.ixx"
+# DEPRECATED: "Src/Dnn/Data/Gpt2Tokenizer.ixx"
+# DEPRECATED: "Src/Dnn/Data/Llama3Tokenizer.ixx"
+"Src/Data/Tokenizers/TokenizerVocabulary.ixx"
+"Src/Data/Tokenizers/TokenizerType.ixx"
 
 "Src/Data/Core/FileHeader.ixx"
 "Src/Data/Core/TokenizerTrainer.ixx"
 "Src/Data/Core/TrainerFactory.ixx"
 
-"Src/Data/Tokenizers/Bpe/Gpt2/BPETokenizer.ixx"
-"Src/Data/Tokenizers/Bpe/Gpt2/BpeTrainer.ixx"
-"Src/Data/Tokenizers/Bpe/Gpt2/BPEVocabulary.ixx"
-"Src/Data/Tokenizers/Bpe/Gpt2/BpeVocabularyConfig.ixx"
-
-"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Tokenizer.ixx"
-"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Vocabulary.ixx"
-"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Vocabulary.Config.ixx"
+# REVIEW: Unified Bpe tokenizer
+#"Src/Data/Tokenizers/Bpe/Gpt2/BPETokenizer.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt2/BpeTrainer.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt2/BPEVocabulary.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt2/BpeVocabularyConfig.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Tokenizer.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Vocabulary.ixx"
+#"Src/Data/Tokenizers/Bpe/Gpt4/Gpt4Vocabulary.Config.ixx"
 
 "Src/Data/Tokenizers/Char/CharTokenizer.ixx"
 "Src/Data/Tokenizers/Char/CharVocabularyConfig.ixx"
 "Src/Data/Tokenizers/Char/CharTrainer.ixx"
 "Src/Data/Tokenizers/Char/CharVocabulary.ixx"
 
-"Src/Data/Tokenizers/Bpe/PreTokenizationMode.ixx"
+"Src/Data/Tokenizers/Bpe/BpeVocabularyConfig.ixx"
+"Src/Data/Tokenizers/Bpe/BpeVocabulary.ixx"
+"Src/Data/Tokenizers/Bpe/BpeTokenizer.ixx"
+"Src/Data/Tokenizers/Bpe/BpeTrainer.ixx"
+
+"Src/Data/Tokenizers/Bpe/BpePreTokenizationMode.ixx"
 
 "Src/Dnn/Components/Transformers/GenerateParams.ixx"
 "Src/Data/Tokenizers/SpecialTokens.ixx"
@@ -430,7 +435,6 @@ PUBLIC
 "Src/Dnn/Components/Attention/GQA/GroupedQueryAttention.Config.ixx"
 
 "Src/Dnn/Compute/Operations/PairedOperation.ixx"
-
 )
 
 set(MILA_INSTALL_FILE_SET_ARGS FILE_SET module_files DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/mila/modules)
@@ -780,8 +784,8 @@ endif()
 # Generate documentation with Doxygen
 add_subdirectory( Docs )
 
-# Add data tokenization tools
-add_subdirectory( Src/Data/Tools )
+# Add tools: tokenizer trainer, model exporter, etc.
+add_subdirectory( Tools )
 
 # Configure code coverage for MSVC
 if( MILA_ENABLE_COVERAGE AND MSVC)
Lines changed: 6 additions & 6 deletions
@@ -1,5 +1,5 @@
 /**
- * @file Gpt2Tokenizer.ixx
+ * @file Gpt2Tokenizer_old.ixx
  * @brief GPT-style BPE tokenizer and binary loader used by Mila.
  *
  * Loads a compact binary tokenizer format and provides encode/decode functionality.
@@ -20,7 +20,7 @@ module;
 #include <functional>
 #include <limits>
 
-export module Data.Gpt2Tokenizer;
+export module Data.Gpt2Tokenizer_old_old;
 
 import Data.Tokenizer;
 
@@ -50,7 +50,7 @@ namespace Mila::Dnn::Data
      * mutations. Concurrent read-only encode/decode usage is acceptable when no
      * writer modifies state.
      */
-    export class Gpt2Tokenizer : public Tokenizer {
+    export class Gpt2Tokenizer_old : public Tokenizer {
     public:
        /**
         * @brief Create a tokenizer by loading the binary file at `path`.
@@ -60,8 +60,8 @@ namespace Mila::Dnn::Data
         * Preconditions: `path` points to a file produced by the repository
         * conversion utility or another producer that follows the same layout.
         */
-        static std::unique_ptr<Gpt2Tokenizer> fromFile( const std::string& path ) {
-            auto tokenizer = std::unique_ptr<Gpt2Tokenizer>( new Gpt2Tokenizer() );
+        static std::unique_ptr<Gpt2Tokenizer_old> fromFile( const std::string& path ) {
+            auto tokenizer = std::unique_ptr<Gpt2Tokenizer_old>( new Gpt2Tokenizer_old() );
             if ( !tokenizer->loadFromBinary( path ) ) {
                 return nullptr;
             }
@@ -162,7 +162,7 @@ namespace Mila::Dnn::Data
         }
 
     private:
-        Gpt2Tokenizer() = default;
+        Gpt2Tokenizer_old() = default;
 
        /**
        * @brief Load the tokenizer from the repository binary layout.
Lines changed: 16 additions & 13 deletions
@@ -8,15 +8,18 @@ module;
 #include <memory>
 #include <algorithm>
 
-export module Data.LlamaTokenizer;
+export module Data.LlamaTokenizer_old;
 
 import Data.Tokenizer;
 
 namespace Mila::Dnn::Data
 {
-    export class LlamaTokenizer : public Tokenizer {
+    // DEPRECATED: This is the original LLaMA tokenizer implementation, retained for reference
+    // TODO: Remove this class after the new LLaMA tokenizer is fully implemented and tested.
+
+    export class LlamaTokenizer_old : public Tokenizer {
     public:
-        static std::unique_ptr<LlamaTokenizer> fromFile( const std::string& path );
+        static std::unique_ptr<LlamaTokenizer_old> fromFile( const std::string& path );
 
         std::vector<TokenId> encode( const std::string& text ) override;
         std::string decode( std::span<const TokenId> tokens ) override;
@@ -41,7 +44,7 @@ namespace Mila::Dnn::Data
         bool isValidToken( TokenId tokenId ) const override;
 
     private:
-        LlamaTokenizer() = default;
+        LlamaTokenizer_old() = default;
 
         bool loadFromBinary( const std::string& path );
 
@@ -67,8 +70,8 @@ namespace Mila::Dnn::Data
         bool useByteFallback_{ true };
     };
 
-    std::unique_ptr<LlamaTokenizer> LlamaTokenizer::fromFile( const std::string& path ) {
-        auto tokenizer = std::unique_ptr<LlamaTokenizer>( new LlamaTokenizer() );
+    std::unique_ptr<LlamaTokenizer_old> LlamaTokenizer_old::fromFile( const std::string& path ) {
+        auto tokenizer = std::unique_ptr<LlamaTokenizer_old>( new LlamaTokenizer_old() );
 
         if ( !tokenizer->loadFromBinary( path ) ) {
             return nullptr;
@@ -77,7 +80,7 @@ namespace Mila::Dnn::Data
         return tokenizer;
     }
 
-    bool LlamaTokenizer::loadFromBinary( const std::string& path ) {
+    bool LlamaTokenizer_old::loadFromBinary( const std::string& path ) {
         std::ifstream file( path, std::ios::binary );
 
         if ( !file ) {
@@ -150,16 +153,16 @@ namespace Mila::Dnn::Data
         return true;
     }
 
-    std::string LlamaTokenizer::normalizeText( const std::string& text ) const {
+    std::string LlamaTokenizer_old::normalizeText( const std::string& text ) const {
         return " " + text;
     }
 
-    std::vector<TokenId> LlamaTokenizer::encode( const std::string& text ) {
+    std::vector<TokenId> LlamaTokenizer_old::encode( const std::string& text ) {
         std::string normalized = normalizeText( text );
         return sentencePieceEncode( normalized );
     }
 
-    std::vector<TokenId> LlamaTokenizer::sentencePieceEncode( const std::string& text ) const {
+    std::vector<TokenId> LlamaTokenizer_old::sentencePieceEncode( const std::string& text ) const {
        std::vector<TokenId> result;
        size_t pos = 0;
 
@@ -205,7 +208,7 @@ namespace Mila::Dnn::Data
         return result;
     }
 
-    std::string LlamaTokenizer::decode( std::span<const TokenId> tokens ) {
+    std::string LlamaTokenizer_old::decode( std::span<const TokenId> tokens ) {
         std::string result;
 
         for ( auto tokenId : tokens ) {
@@ -237,12 +240,12 @@ namespace Mila::Dnn::Data
         return result;
     }
 
-    std::string LlamaTokenizer::tokenToString( TokenId tokenId ) const {
+    std::string LlamaTokenizer_old::tokenToString( TokenId tokenId ) const {
         auto it = idToPiece_.find( tokenId );
         return it != idToPiece_.end() ? it->second : "<UNK>";
     }
 
-    bool LlamaTokenizer::isValidToken( TokenId tokenId ) const {
+    bool LlamaTokenizer_old::isValidToken( TokenId tokenId ) const {
         return idToPiece_.contains( tokenId );
     }
 }
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
