
Commit 734693d

feat(LlamaCompletion): stopOnAbortSignal (#538)
* feat(`LlamaCompletion`): `stopOnAbortSignal`
* feat(`LlamaModel`): `useDirectIo`
* fix: support new CUDA 13.1 archs
* fix: build the prebuilt binaries with CUDA 13.1 instead of 13.0
* docs: stopping a text completion generation
1 parent 7e467cc commit 734693d

18 files changed

Lines changed: 244 additions & 75 deletions


.github/workflows/build.yml

Lines changed: 17 additions & 29 deletions
@@ -97,8 +97,8 @@ jobs:
       - name: Install dependencies on Windows
         if: startsWith(matrix.config.os, 'windows')
         run: |
-          choco install cmake.install --version=3.31.1
-          choco install cmake --version=3.31.1
+          choco install cmake.install --version=4.2.1
+          choco install cmake --version=4.2.1
           choco install ninja

       - name: Install dependencies on Ubuntu (1)
@@ -107,9 +107,9 @@ jobs:
           sudo apt-get update
           sudo apt-get install ninja-build libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf

-          wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz
-          sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz
-          rm -f ./cmake-3.31.7-linux-x86_64.tar.gz
+          wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz
+          sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz
+          rm -f ./cmake-4.2.1-linux-x86_64.tar.gz

           which aarch64-linux-gnu-gcc
           which aarch64-linux-gnu-g++
@@ -125,31 +125,19 @@ jobs:
           sudo apt-get update
           sudo apt-get install ninja-build libtbb-dev

-          wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz
-          sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz
-          rm -f ./cmake-3.31.7-linux-x86_64.tar.gz
+          wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz
+          sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz
+          rm -f ./cmake-4.2.1-linux-x86_64.tar.gz

           cmake --version

-      - name: Install Cuda 13.0 on Windows (1)
+      - name: Install Cuda 13.1 on Windows (1)
         if: matrix.config.name == 'Windows (1)'
-        shell: bash
-        timeout-minutes: 60
-        run: |
-          curl -Lo cuda_13.0.0_windows_network.exe https://developer.download.nvidia.com/compute/cuda/13.0.0/network_installers/cuda_13.0.0_windows_network.exe
-
-          echo "Installing Cuda 13.0.0"
-          powershell -Command "Start-Process -FilePath cuda_13.0.0_windows_network.exe -ArgumentList '-s','-n' -Wait"
-          echo "Cuda installation finished"
-
-          rm -f ./cuda_13.0.0_windows_network.exe
-
-          echo "where cudart64_13.dll: $(where cudart64_13.dll)"
-
-          echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
-          echo "CUDA_PATH_V13_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV
-          echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH
+        uses: Jimver/cuda-toolkit@v0.2.30
+        with:
+          cuda: '13.1.0'
+          method: 'network'
+          use-local-cache: false

       - name: Install Cuda 12.4 on Windows (2)
         if: matrix.config.name == 'Windows (2)'
@@ -160,11 +148,11 @@ jobs:
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
           use-local-cache: false

-      - name: Install Cuda 13.0 on Ubuntu (1)
+      - name: Install Cuda 13.1 on Ubuntu (1)
         if: matrix.config.name == 'Ubuntu (1)'
-        uses: Jimver/cuda-toolkit@v0.2.27
+        uses: Jimver/cuda-toolkit@v0.2.30
         with:
-          cuda: '13.0.0'
+          cuda: '13.1.0'
           method: 'network'

       - name: Install Cuda 12.4 on Ubuntu (2)

docs/guide/CUDA.md

Lines changed: 11 additions & 11 deletions
@@ -9,14 +9,14 @@ description: CUDA support in node-llama-cpp
 and these are automatically used when CUDA is detected on your machine.

 To use `node-llama-cpp`'s CUDA support with your NVIDIA GPU,
-make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher installed on your machine.
+make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher installed on your machine.

 If the pre-built binaries don't work with your CUDA installation,
 `node-llama-cpp` will automatically download a release of `llama.cpp` and build it from source with CUDA support.
 Building from source with CUDA support is slow and can take up to an hour.

-The pre-built binaries are compiled with CUDA Toolkit 12.4,
-so any version of CUDA Toolkit that is 12.4 or higher should work with the pre-built binaries.
+The pre-built binaries are compiled with CUDA Toolkits 12.4 and 13.1,
+so any CUDA Toolkit 12 that's on version 12.4 or higher or CUDA Toolkit 13 on version 13.1 or higher should work with the pre-built binaries.
 If you have an older version of CUDA Toolkit installed on your machine,
 consider updating it to avoid having to wait the long build time.

@@ -42,7 +42,7 @@ You should see an output like this:
 If you see `CUDA used VRAM` in the output, it means that CUDA support is working on your machine.

 ## Prerequisites
-* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher
+* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher
 * [NVIDIA Drivers](https://www.nvidia.com/en-us/drivers/)
 * [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=%5Bstring%5D-,Requirements%3A,-CMake)
 * [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues)
@@ -83,21 +83,21 @@ To build `node-llama-cpp` with any of these options, set an environment variable
 To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler,
 and the `CUDA_PATH` environment variable to the path of the CUDA home directory that contains the `nvcc` compiler.

-For example, if you have installed CUDA Toolkit 12.4, you have to run a command like this:
+For example, if you have installed CUDA Toolkit 13.1, you have to run a command like this:
 ::: code-group
 ```shell [Linux]
-export CUDACXX=/usr/local/cuda-12.4/bin/nvcc
-export CUDA_PATH=/usr/local/cuda-12.4
+export CUDACXX=/usr/local/cuda-13.1/bin/nvcc
+export CUDA_PATH=/usr/local/cuda-13.1
 ```

 ```cmd [Windows (cmd)]
-set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe
-set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4
+set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe
+set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1
 ```

 ```cmd [Windows (PowerShell)]
-$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe"
-$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe"
+$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
 ```
 :::

docs/guide/text-completion.md

Lines changed: 48 additions & 0 deletions
@@ -76,3 +76,51 @@ const res = await completion.generateInfillCompletion(prefix, suffix, {
 console.log("Fill: " + res);
 ```
 > This example uses [CodeGemma](https://huggingface.co/bartowski/codegemma-2b-GGUF).
+
+## Stop Text Completion Generation {#stop-generation}
+To stop the generation of an ongoing text completion without throwing an error (to get the partially generated text),
+you can use the [`stopOnAbortSignal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#stoponabortsignal) option
+to configure what happens when the given [`signal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#signal) is aborted.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaCompletion} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const completion = new LlamaCompletion({
+    contextSequence: context.getSequence()
+});
+
+const abortController = new AbortController();
+const input = "Here is a list of sweet fruits:\n* ";
+console.log("Input: " + input);
+
+let result = "";
+
+process.stdout.write("Streamed completion: ");
+const res = await completion.generateCompletion(input, {
+    maxTokens: 256,
+
+    // stop the generation, instead of cancelling it
+    stopOnAbortSignal: true,
+
+    signal: abortController.signal,
+    onTextChunk(chunk) {
+        result += chunk;
+        process.stdout.write(chunk);
+
+        // max 10 lines
+        if (result.split("\n").length >= 10)
+            abortController.abort();
+    }
+});
+console.log();
+console.log("Completion: " + res);
+```

llama/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
@@ -84,13 +84,6 @@ else()
     set(NLC_GGML_NATIVE ON)
 endif()

-if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE)
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
-        set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real")
-    endif()
-endif()
-
 add_subdirectory("llama.cpp")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 0 deletions
@@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonM
         model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
     }

+    if (options.Has("useDirectIo")) {
+        model_params.use_direct_io = options.Get("useDirectIo").As<Napi::Boolean>().Value();
+    }
+
     if (options.Has("useMlock")) {
         model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
     }

src/bindings/AddonTypes.ts

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ export type BindingModule = {
         gpuLayers?: number,
         vocabOnly?: boolean,
         useMmap?: boolean,
+        useDirectIo?: boolean,
         useMlock?: boolean,
         checkTensors?: boolean,
         onLoadProgress?(loadPercentage: number): void,
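The diff above only adds `useDirectIo` to the binding's type. As a minimal usage sketch (not part of this commit), the flag would presumably be passed to `llama.loadModel()` alongside `useMmap`, with the CLI's new `noDirectIo` option mapping to `useDirectIo: false`; the public option name here is an assumption based on the binding field.

```typescript
// Minimal sketch, assuming `useDirectIo` is accepted by `llama.loadModel()`
// the same way `useMmap` is; the public option name is an assumption.
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"),

    // hypothetical option mirroring the binding's `useDirectIo` flag;
    // the CLI would set this to false when `noDirectIo` is passed
    useDirectIo: false
});

console.log("Model loaded");
```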

src/cli/commands/ChatCommand.ts

Lines changed: 14 additions & 3 deletions
@@ -72,6 +72,7 @@ type ChatCommand = {
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
+    noDirectIo: boolean,
     printTimings: boolean
 };

@@ -329,6 +330,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Disable mmap (memory-mapped file) usage"
             })
+            .option("noDirectIo", {
+                type: "boolean",
+                default: false,
+                description: "Disable Direct I/O usage when available"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -342,7 +348,8 @@
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo,
+        printTimings
     }) {
         try {
             await RunChat({
@@ -351,7 +358,7 @@
                 temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-                debug, numa, meter, timing, noMmap, printTimings
+                debug, numa, meter, timing, noMmap, noDirectIo, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -368,7 +375,7 @@ async function RunChat({
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -395,6 +402,7 @@ async function RunChat({
     });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
+    const useDirectIo = !noDirectIo;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
@@ -452,6 +460,7 @@ async function RunChat({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -486,6 +495,7 @@ async function RunChat({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
         },
@@ -591,6 +601,7 @@ async function RunChat({
         context,
         draftContext,
         useMmap,
+        useDirectIo,
         printBos: true,
         printEos: true,
         logBatchSize,

src/cli/commands/CompleteCommand.ts

Lines changed: 13 additions & 3 deletions
@@ -54,6 +54,7 @@ type CompleteCommand = {
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
+    noDirectIo: boolean,
     printTimings: boolean
 };

@@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: false,
                 description: "Disable mmap (memory-mapped file) usage"
             })
+            .option("noDirectIo", {
+                type: "boolean",
+                default: false,
+                description: "Disable Direct I/O usage when available"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -261,14 +267,14 @@
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, numa, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, noDirectIo, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -283,7 +289,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -308,6 +314,7 @@ async function RunCompletion({
     });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
+    const useDirectIo = !noDirectIo;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
@@ -358,6 +365,7 @@ async function RunCompletion({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -392,6 +400,7 @@ async function RunCompletion({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
         },
@@ -470,6 +479,7 @@ async function RunCompletion({
         context,
         draftContext,
         useMmap,
+        useDirectIo,
         minTitleLength: "Complete".length + 1,
         logBatchSize,
         tokenMeterEnabled: meter
