Commit 491c15c
Version: 0.11.1-alpha.3
BF16 backend WIP: full CUDA support for ops

- Adds BF16 CUDA kernels for TokenEmbedding and RoPE; removes all FP16 code
- Refactors LanguageModel API: streaming/callback-based generation, async support
- Updates Chat CLI to stream tokens live and support cancellation
- Declares BF16 as primary reduced-precision target; updates docs and build

BREAKING CHANGE: FP16 is no longer supported; all reduced-precision ops use BF16. Generation API is now streaming/callback-based.

1 parent bb3133a

21 files changed: 1123 additions & 3437 deletions
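The BREAKING CHANGE is easiest to read against the Chat sample diff below, the only call site shown in this view. Reconstructed from that call site alone (the actual declaration lives in Src/Dnn/Core/LanguageModel.ixx, which is not among the diffs displayed), the new streaming entry point plausibly looks like the sketch below; the return type, parameter types, and callback signature beyond what the caller passes are assumptions:

// Hypothetical shape of the new generation API, inferred from the Chat call
// site. Only the name generateAsync, the argument order, and the use of a
// std::stop_token are visible in this commit; the rest is a guess.
std::future<void> generateAsync(
    std::vector<int32_t> tokens,              // prompt token ids
    std::function<void( int32_t )> on_token,  // invoked once per generated token
    std::size_t max_new_tokens,
    float temperature,
    int top_k,
    std::stop_token stop );                   // cooperative cancellation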

Mila/CMakeLists.txt

Lines changed: 4 additions & 3 deletions
@@ -10,13 +10,14 @@ add_library( Mila STATIC
     # Cuda Operations Kernels
     #--------------------------------------------------------------------------
     "Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/Kernels/TokenEmbedding.cuh"
-    "Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/Kernels/TokenEmbedding.cu"
+    "Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/Kernels/TokenEmbedding.Fp32.cu"
+    "Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/Kernels/TokenEmbedding.Bf16.cu"
 
     "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Lpe/Kernels/Lpe.Fp32.cu"
     "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Lpe/Kernels/Lpe.Fp16.cu"
     "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Lpe/Kernels/Lpe.cuh"
     "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Rope/Kernels/Rope.Fp32.cu"
-    #"Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Rope/Kernels/Rope.Fp16.cu"
+    "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Rope/Kernels/Rope.Bf16.cu"
     "Src/Dnn/Compute/Devices/Cuda/Operations/Encodings/Rope/Kernels/Rope.cuh"
 
     "Src/Dnn/Compute/Devices/Cuda/Operations/Attention/Common/Kernels/CudaAttention.cuh"

@@ -446,7 +447,7 @@ PUBLIC
     "Src/Dnn/Components/Attention/GQA/GroupedQueryAttention.Config.ixx"
 
     "Src/Dnn/Compute/Operations/PairedOperation.ixx"
-    "Src/Dnn/Core/Component.MemoryStats.ixx" "Src/Dnn/Core/Model.RuntimeMode.ixx" "Src/Dnn/Core/Model.ixx" "Src/Dnn/Core/LanguageModel.ixx" "Src/Dnn/Core/Comonent.TrainingMode.ixx" "Src/Dnn/Compute/Operations/IPositionalPairedOp.ixx" "Src/Dnn/Tensors/Operations/TensorOps.Random.ixx" "Src/Dnn/Tensors/Operations/TensorOps.Structural.ixx" "Src/Dnn/Compute/Operations/IPositionalDecode.ixx" "Src/Dnn/Compute/Operations/IKvInference.ixx" "Src/Dnn/Compute/Operations/IPackedKvInference.ixx")
+    "Src/Dnn/Core/Component.MemoryStats.ixx" "Src/Dnn/Core/Model.RuntimeMode.ixx" "Src/Dnn/Core/Model.ixx" "Src/Dnn/Core/LanguageModel.ixx" "Src/Dnn/Core/Comonent.TrainingMode.ixx" "Src/Dnn/Compute/Operations/IPositionalPairedOp.ixx" "Src/Dnn/Tensors/Operations/TensorOps.Random.ixx" "Src/Dnn/Tensors/Operations/TensorOps.Structural.ixx" "Src/Dnn/Compute/Operations/IPositionalDecode.ixx" "Src/Dnn/Compute/Operations/IKvInference.ixx" "Src/Dnn/Compute/Operations/IPackedKvInference.ixx" "Src/Dnn/Core/TokenStreamer.ixx")
 
 set(MILA_INSTALL_FILE_SET_ARGS FILE_SET module_files DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/mila/modules)

Mila/Samples/Chat/Src/Chat.ixx

Lines changed: 66 additions & 77 deletions
@@ -15,6 +15,8 @@ module;
 #include <format>
 #include <memory>
 #include <stdexcept>
+#include <future>
+#include <stop_token>
 
 export module Mila.Chat;
 export import Chat.Config;

@@ -25,7 +27,6 @@ namespace Mila::ChatApp
     using namespace Mila::Dnn;
     using namespace Mila::Dnn::Compute;
     using namespace Mila::Data;
-    using namespace Mila::Data;
 
     using LanguageModelType = LanguageModel<DeviceType::Cuda, TensorDataType::FP32>;
 

@@ -84,11 +85,34 @@ namespace Mila::ChatApp
 
             conversation_history.push_back( "User: " + user_input );
 
-            std::string response = generateResponse( conversation_history );
+            const std::string& prompt = conversation_history.back().substr( 6 );
+            std::vector<TokenId> prompt_tokens = tokenizer_->encode( prompt );
+            std::vector<int32_t> input_tokens( prompt_tokens.begin(), prompt_tokens.end() );
+
+            std::string response;
+            response.reserve( 512 );
+
+            std::cout << "\nMila: ";
 
-            conversation_history.push_back( "Mila: " + response );
+            stop_src_ = std::stop_source{};
 
-            std::cout << "\nMila: " << response << "\n";
+            auto fut = model_->generateAsync(
+                input_tokens,
+                [&]( int32_t tok )
+                {
+                    auto text = tokenizer_->decode( std::vector<TokenId>{ static_cast<TokenId>(tok) } );
+                    response += text;
+                    std::cout << text << std::flush;
+                },
+                config_.max_new_tokens,
+                config_.temperature,
+                config_.top_k,
+                stop_src_.get_token() );
+
+            fut.wait();
+            std::cout << '\n';
+
+            conversation_history.push_back( "Mila: " + trimResponse( response ) );
         }
     }
 

@@ -123,92 +147,56 @@ namespace Mila::ChatApp
 
     void loadModel()
     {
-        //try
-        //{
-        std::cout << "Loading model from: " << config_.model_path << "\n";
+        std::cout << "Loading model from: " << config_.model_path << "\n";
 
-        switch ( config_.model_type )
-        {
-        case ModelType::Gpt:
-            model_ = GptModel<DeviceType::Cuda, TensorDataType::FP32>::fromPretrained(
-                config_.model_path,
-                config_.context_length,
-                DeviceId{ DeviceType::Cuda, 0 },
-                /*strict=*/true );
-            break;
-
-        case ModelType::Llama:
-            model_ = LlamaModel<DeviceType::Cuda, TensorDataType::FP32>::fromPretrained(
-                config_.model_path,
-                config_.context_length,
-                DeviceId{ DeviceType::Cuda, 0 },
-                /*strict=*/true );
-            break;
-        }
-
-        std::cout << model_->toString();
-
-        auto stats = model_->getMemoryStats();
-        std::cout << stats.toString() << "\n";
-
-        std::cout << "Model loaded successfully!\n";
-        //}
-        //catch ( const std::exception& e )
-        //{
-        //    std::cerr << "Error loading model: " << e.what() << "\n";
-        //    throw;
-        //}
-    }
-
-    std::string generateResponse( const std::vector<std::string>& history )
-    {
-        /*try
-        {*/
-        if ( !tokenizer_ )
-            return "Tokenizer not loaded.";
-
-        // Both GPT-2 and LLaMA base models are completion models; pass the raw
-        // user text without a chat template to avoid instruction-format mismatch.
-        const std::string& prompt = history.back().substr( 6 ); // strip "User: "
-
-        std::vector<TokenId> prompt_tokens = tokenizer_->encode( prompt );
+        switch ( config_.model_type )
+        {
+        case ModelType::Gpt:
+            model_ = GptModel<DeviceType::Cuda, TensorDataType::FP32>::fromPretrained(
+                config_.model_path,
+                config_.context_length,
+                DeviceId{ DeviceType::Cuda, 0 },
+                /*strict=*/true );
+            break;
 
-        std::vector<int32_t> input_tokens( prompt_tokens.begin(), prompt_tokens.end() );
+        case ModelType::Llama:
+            model_ = LlamaModel<DeviceType::Cuda, TensorDataType::FP32>::fromPretrained(
+                config_.model_path,
+                config_.context_length,
+                DeviceId{ DeviceType::Cuda, 0 },
+                /*strict=*/true );
+            break;
+        }
 
-        std::vector<int32_t> generated = model_->generate(
-            std::vector<int32_t>( input_tokens ),
-            config_.max_new_tokens,
-            config_.temperature,
-            config_.top_k );
+        std::cout << model_->toString();
 
-        std::string full_text = tokenizer_->decode( std::vector<TokenId>( generated.begin(), generated.end() ) );
+        auto stats = model_->getMemoryStats();
+        std::cout << stats.toString() << "\n";
 
-        return extractResponse( full_text, prompt );
-        /*}
-        catch ( const std::exception& e )
-        {
-            return "Error: " + std::string( e.what() );
-        }*/
+        std::cout << "Model loaded successfully!\n";
     }
 
-    std::string extractResponse(
-        const std::string& full_output,
-        const std::string& prompt ) const
+    /**
+     * @brief Strip leading whitespace and truncate at the first paragraph break.
+     *
+     * Applied to the accumulated streaming response before storing in history.
+     * The live printed output is unaffected.
+     */
+    std::string trimResponse( const std::string& raw ) const
    {
-        if ( full_output.size() <= prompt.size() )
-            return full_output;
+        auto start = raw.find_first_not_of( " \t\n\r" );
+
+        if ( start == std::string::npos )
+            return {};
 
-        std::string response = full_output.substr( prompt.size() );
+        std::string result = raw.substr( start );
 
-        auto start = response.find_first_not_of( " \t\n\r" );
-        if ( start != std::string::npos )
-            response = response.substr( start );
+        auto end = result.find( "\n\n" );
 
-        auto end = response.find( "\n\n" );
        if ( end != std::string::npos )
-            response = response.substr( 0, end );
+            result.resize( end );
 
-        return response;
+        return result;
    }
 
    void printWelcome() const

@@ -243,5 +231,6 @@ Just type your message to chat with Mila AI.
        ChatConfig config_;
        std::unique_ptr<LanguageModelType> model_;
        std::shared_ptr<BpeTokenizer> tokenizer_{ nullptr };
+       std::stop_source stop_src_;
    };
}
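The new stop_src_ member is reset at the start of each turn and its token handed to generateAsync, so any thread can cancel an in-flight reply. As a minimal sketch of how that wiring could be exercised (not part of this commit; it assumes the generation loop polls stop_requested() between tokens):

#include <chrono>
#include <future>
#include <stop_token>
#include <thread>

// Watchdog pattern: request_stop() fires if generation outlives the deadline;
// fut becomes ready as soon as the model's loop observes the stop request.
void waitWithDeadline( std::future<void>& fut, std::stop_source& src,
                       std::chrono::seconds limit )
{
    std::jthread watchdog( [&]
    {
        // Returns early if generation finishes before the deadline.
        if ( fut.wait_for( limit ) == std::future_status::timeout )
            src.request_stop();
    } );

    fut.wait();  // unblocks on normal completion or after cancellation
}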

Mila/Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/CudaTokenEmbeddingOp.Dispatch.ixx

Lines changed: 9 additions & 9 deletions
@@ -15,7 +15,7 @@ export module Compute.CudaTokenEmbeddingOp:Dispatch;
 namespace Mila::Dnn::Compute::Cuda::TokenEmbedding::Detail
 {
     template <typename TNative>
-        requires std::is_same_v<TNative, float> || std::is_same_v<TNative, half>
+        requires std::is_same_v<TNative, float> || std::is_same_v<TNative, __nv_bfloat16>
     struct cuda_token_embedding_impl;
 
     // ========================================================================

@@ -48,31 +48,31 @@ namespace Mila::Dnn::Compute::Cuda::TokenEmbedding::Detail
     };
 
     // ========================================================================
-    // FP16 (stubs)
+    // BF16
     // ========================================================================
 
     template <>
-    struct cuda_token_embedding_impl<half>
+    struct cuda_token_embedding_impl<__nv_bfloat16>
     {
         static void forward(
-            half* Y, const int* X, const half* wte,
+            __nv_bfloat16* Y, const int* X, const __nv_bfloat16* wte,
             int B, int T, int C, cudaStream_t stream )
         {
-            // TODO: cuda_token_embedding_forward_fp16(...)
+            cuda_token_embedding_forward_bf16( Y, X, wte, B, T, C, stream );
         }
 
         static void backward(
-            half* dwte, const half* dY, const int* X,
+            __nv_bfloat16* dwte, const __nv_bfloat16* dY, const int* X,
             int B, int T, int C, cudaStream_t stream )
         {
-            // TODO: cuda_token_embedding_backward_fp16(...)
+            cuda_token_embedding_backward_bf16( dwte, dY, X, B, T, C, stream );
         }
 
         static void decode(
-            half* Y, const int* X, const half* wte,
+            __nv_bfloat16* Y, const int* X, const __nv_bfloat16* wte,
             int B, int C, cudaStream_t stream )
         {
-            // TODO: cuda_token_embedding_decode_fp16(...)
+            cuda_token_embedding_decode_bf16( Y, X, wte, B, C, stream );
         }
     };
 }
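The three *_bf16 entry points dispatched above are implemented in TokenEmbedding.Bf16.cu, which this commit adds but which is not among the diffs displayed. For orientation, a minimal sketch of what the forward gather typically looks like; the kernel body and launch geometry here are assumptions, not the shipped code:

#include <cuda_bf16.h>

// One thread per output element: Y[b,t,:] = wte[X[b,t],:] with shapes (B,T,C).
__global__ void token_embedding_forward_bf16_kernel(
    __nv_bfloat16* Y, const int* X, const __nv_bfloat16* wte,
    int B, int T, int C )
{
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if ( idx >= B * T * C ) return;

    int c  = idx % C;   // channel within the embedding row
    int bt = idx / C;   // flattened (batch, position) index
    Y[idx] = wte[X[bt] * C + c];
}

void cuda_token_embedding_forward_bf16(
    __nv_bfloat16* Y, const int* X, const __nv_bfloat16* wte,
    int B, int T, int C, cudaStream_t stream )
{
    int total   = B * T * C;
    int threads = 256;
    int blocks  = ( total + threads - 1 ) / threads;
    token_embedding_forward_bf16_kernel<<<blocks, threads, 0, stream>>>( Y, X, wte, B, T, C );
}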

Mila/Src/Dnn/Compute/Devices/Cuda/Operations/Embeddings/CudaTokenEmbeddingOp.ixx

Lines changed: 2 additions & 2 deletions
@@ -280,8 +280,8 @@ namespace Mila::Dnn::Compute::Cuda::TokenEmbedding
             TensorDataType::INT32, TensorDataType::FP32>( "TokenEmbeddingOp" );
 
         registerUnaryOpType<DeviceType::Cuda,
-            CudaTokenEmbeddingOp<TensorDataType::INT32, TensorDataType::FP16>,
-            TensorDataType::INT32, TensorDataType::FP16>( "TokenEmbeddingOp" );
+            CudaTokenEmbeddingOp<TensorDataType::INT32, TensorDataType::BF16>,
+            TensorDataType::INT32, TensorDataType::BF16>( "TokenEmbeddingOp" );
     }
 };
 }
