
Commit 734693d

feat(LlamaCompletion): stopOnAbortSignal (#538)
* feat(`LlamaCompletion`): `stopOnAbortSignal`
* feat(`LlamaModel`): `useDirectIo`
* fix: support new CUDA 13.1 archs
* fix: build the prebuilt binaries with CUDA 13.1 instead of 13.0
* docs: stopping a text completion generation
1 parent 7e467cc commit 734693d

18 files changed

Lines changed: 244 additions & 75 deletions


.github/workflows/build.yml

Lines changed: 17 additions & 29 deletions
@@ -97,8 +97,8 @@ jobs:
       - name: Install dependencies on Windows
         if: startsWith(matrix.config.os, 'windows')
         run: |
-          choco install cmake.install --version=3.31.1
-          choco install cmake --version=3.31.1
+          choco install cmake.install --version=4.2.1
+          choco install cmake --version=4.2.1
           choco install ninja

       - name: Install dependencies on Ubuntu (1)
@@ -107,9 +107,9 @@ jobs:
           sudo apt-get update
           sudo apt-get install ninja-build libtbb-dev g++-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-arm-linux-gnueabihf gcc-arm-linux-gnueabihf

-          wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz
-          sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz
-          rm -f ./cmake-3.31.7-linux-x86_64.tar.gz
+          wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz
+          sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz
+          rm -f ./cmake-4.2.1-linux-x86_64.tar.gz

           which aarch64-linux-gnu-gcc
           which aarch64-linux-gnu-g++
@@ -125,31 +125,19 @@ jobs:
           sudo apt-get update
           sudo apt-get install ninja-build libtbb-dev

-          wget -c https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-x86_64.tar.gz
-          sudo tar --strip-components=1 -C /usr/local -xzf cmake-3.31.7-linux-x86_64.tar.gz
-          rm -f ./cmake-3.31.7-linux-x86_64.tar.gz
+          wget -c https://github.com/Kitware/CMake/releases/download/v4.2.1/cmake-4.2.1-linux-x86_64.tar.gz
+          sudo tar --strip-components=1 -C /usr/local -xzf cmake-4.2.1-linux-x86_64.tar.gz
+          rm -f ./cmake-4.2.1-linux-x86_64.tar.gz

           cmake --version

-      - name: Install Cuda 13.0 on Windows (1)
+      - name: Install Cuda 13.1 on Windows (1)
         if: matrix.config.name == 'Windows (1)'
-        shell: bash
-        timeout-minutes: 60
-        run: |
-          curl -Lo cuda_13.0.0_windows_network.exe https://developer.download.nvidia.com/compute/cuda/13.0.0/network_installers/cuda_13.0.0_windows_network.exe
-
-          echo "Installing Cuda 13.0.0"
-          powershell -Command "Start-Process -FilePath cuda_13.0.0_windows_network.exe -ArgumentList '-s','-n' -Wait"
-          echo "Cuda installation finished"
-
-          rm -f ./cuda_13.0.0_windows_network.exe
-
-          echo "where cudart64_13.dll: $(where cudart64_13.dll)"
-
-          echo "CUDA_PATH=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
-          echo "CUDA_PATH_V13_0=C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0" >> $GITHUB_ENV
-          echo "CUDA_PATH_VX_Y=CUDA_PATH_V13_0" >> $GITHUB_ENV
-          echo "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v13.0\\bin" >> $GITHUB_PATH
+        uses: Jimver/cuda-toolkit@v0.2.30
+        with:
+          cuda: '13.1.0'
+          method: 'network'
+          use-local-cache: false

       - name: Install Cuda 12.4 on Windows (2)
         if: matrix.config.name == 'Windows (2)'
@@ -160,11 +148,11 @@ jobs:
           sub-packages: '["nvcc", "cudart", "cublas", "cublas_dev", "thrust", "visual_studio_integration"]'
           use-local-cache: false

-      - name: Install Cuda 13.0 on Ubuntu (1)
+      - name: Install Cuda 13.1 on Ubuntu (1)
         if: matrix.config.name == 'Ubuntu (1)'
-        uses: Jimver/cuda-toolkit@v0.2.27
+        uses: Jimver/cuda-toolkit@v0.2.30
         with:
-          cuda: '13.0.0'
+          cuda: '13.1.0'
           method: 'network'

       - name: Install Cuda 12.4 on Ubuntu (2)

docs/guide/CUDA.md

Lines changed: 11 additions & 11 deletions
@@ -9,14 +9,14 @@ description: CUDA support in node-llama-cpp
 and these are automatically used when CUDA is detected on your machine.

 To use `node-llama-cpp`'s CUDA support with your NVIDIA GPU,
-make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher installed on your machine.
+make sure you have [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher installed on your machine.

 If the pre-built binaries don't work with your CUDA installation,
 `node-llama-cpp` will automatically download a release of `llama.cpp` and build it from source with CUDA support.
 Building from source with CUDA support is slow and can take up to an hour.

-The pre-built binaries are compiled with CUDA Toolkit 12.4,
-so any version of CUDA Toolkit that is 12.4 or higher should work with the pre-built binaries.
+The pre-built binaries are compiled with CUDA Toolkits 12.4 and 13.1,
+so any CUDA Toolkit 12 that's on version 12.4 or higher or CUDA Toolkit 13 on version 13.1 or higher should work with the pre-built binaries.
 If you have an older version of CUDA Toolkit installed on your machine,
 consider updating it to avoid having to wait the long build time.

@@ -42,7 +42,7 @@ You should see an output like this:
 If you see `CUDA used VRAM` in the output, it means that CUDA support is working on your machine.

 ## Prerequisites
-* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 12.4 or higher
+* [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads) 13.1 or higher
 * [NVIDIA Drivers](https://www.nvidia.com/en-us/drivers/)
 * [`cmake-js` dependencies](https://github.com/cmake-js/cmake-js#:~:text=%5Bstring%5D-,Requirements%3A,-CMake)
 * [CMake](https://cmake.org/download/) 3.26 or higher (optional, recommended if you have build issues)
@@ -83,21 +83,21 @@ To build `node-llama-cpp` with any of these options, set an environment variable
 To fix this issue you have to set the `CUDACXX` environment variable to the path of the `nvcc` compiler,
 and the `CUDA_PATH` environment variable to the path of the CUDA home directory that contains the `nvcc` compiler.

-For example, if you have installed CUDA Toolkit 12.4, you have to run a command like this:
+For example, if you have installed CUDA Toolkit 13.1, you have to run a command like this:
 ::: code-group
 ```shell [Linux]
-export CUDACXX=/usr/local/cuda-12.4/bin/nvcc
-export CUDA_PATH=/usr/local/cuda-12.4
+export CUDACXX=/usr/local/cuda-13.1/bin/nvcc
+export CUDA_PATH=/usr/local/cuda-13.1
 ```

 ```cmd [Windows (cmd)]
-set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe
-set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4
+set CUDACXX=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe
+set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1
 ```

 ```cmd [Windows (PowerShell)]
-$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4\bin\nvcc.exe"
-$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.4"
+$env:CUDACXX="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1\bin\nvcc.exe"
+$env:CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v13.1"
 ```
 :::

docs/guide/text-completion.md

Lines changed: 48 additions & 0 deletions
@@ -76,3 +76,51 @@ const res = await completion.generateInfillCompletion(prefix, suffix, {
 console.log("Fill: " + res);
 ```
 > This example uses [CodeGemma](https://huggingface.co/bartowski/codegemma-2b-GGUF).
+
+## Stop Text Completion Generation {#stop-generation}
+To stop the generation of an ongoing text completion without throwing an error (to get the partially generated text),
+you can use the [`stopOnAbortSignal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#stoponabortsignal) option
+to configure what happens when the given [`signal`](../api/type-aliases/LlamaCompletionGenerationOptions.md#signal) is aborted.
+
+```typescript
+import {fileURLToPath} from "url";
+import path from "path";
+import {getLlama, LlamaCompletion} from "node-llama-cpp";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+
+const llama = await getLlama();
+const model = await llama.loadModel({
+    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf")
+});
+const context = await model.createContext();
+const completion = new LlamaCompletion({
+    contextSequence: context.getSequence()
+});
+
+const abortController = new AbortController();
+const input = "Here is a list of sweet fruits:\n* ";
+console.log("Input: " + input);
+
+let result = "";
+
+process.stdout.write("Streamed completion: ");
+const res = await completion.generateCompletion(input, {
+    maxTokens: 256,
+
+    // stop the generation, instead of cancelling it
+    stopOnAbortSignal: true,
+
+    signal: abortController.signal,
+    onTextChunk(chunk) {
+        result += chunk;
+        process.stdout.write(chunk);
+
+        // max 10 lines
+        if (result.split("\n").length >= 10)
+            abortController.abort();
+    }
+});
+console.log();
+console.log("Completion: " + res);
+```

llama/CMakeLists.txt

Lines changed: 0 additions & 7 deletions
@@ -84,13 +84,6 @@ else()
     set(NLC_GGML_NATIVE ON)
 endif()

-if (GGML_CUDA AND NOT DEFINED CMAKE_CUDA_ARCHITECTURES AND NOT NLC_GGML_NATIVE)
-    find_package(CUDAToolkit)
-    if (CUDAToolkit_FOUND AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "13.0")
-        set(CMAKE_CUDA_ARCHITECTURES "75-virtual;80-virtual;86-real;89-real;90-real")
-    endif()
-endif()
-
 add_subdirectory("llama.cpp")
 include_directories("llama.cpp")
 include_directories("./llama.cpp/common")

llama/addon/AddonModel.cpp

Lines changed: 4 additions & 0 deletions
@@ -252,6 +252,10 @@ AddonModel::AddonModel(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonM
         model_params.use_mmap = options.Get("useMmap").As<Napi::Boolean>().Value();
     }

+    if (options.Has("useDirectIo")) {
+        model_params.use_direct_io = options.Get("useDirectIo").As<Napi::Boolean>().Value();
+    }
+
     if (options.Has("useMlock")) {
         model_params.use_mlock = options.Get("useMlock").As<Napi::Boolean>().Value();
     }

src/bindings/AddonTypes.ts

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ export type BindingModule = {
         gpuLayers?: number,
         vocabOnly?: boolean,
         useMmap?: boolean,
+        useDirectIo?: boolean,
         useMlock?: boolean,
         checkTensors?: boolean,
         onLoadProgress?(loadPercentage: number): void,
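The diff above only adds `useDirectIo` to the binding's type. As a minimal usage sketch (not part of this commit), the flag would presumably be passed to `llama.loadModel()` alongside `useMmap`, with the CLI's new `noDirectIo` option mapping to `useDirectIo: false`; the public option name here is an assumption based on the binding field.

```typescript
// Minimal sketch, assuming `useDirectIo` is accepted by `llama.loadModel()`
// the same way `useMmap` is; the public option name is an assumption.
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf"),

    // hypothetical option mirroring the binding's `useDirectIo` flag;
    // the CLI would set this to false when `noDirectIo` is passed
    useDirectIo: false
});

console.log("Model loaded");
```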

src/cli/commands/ChatCommand.ts

Lines changed: 14 additions & 3 deletions
@@ -72,6 +72,7 @@ type ChatCommand = {
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
+    noDirectIo: boolean,
     printTimings: boolean
 };

@@ -329,6 +330,11 @@ export const ChatCommand: CommandModule<object, ChatCommand> = {
                 default: false,
                 description: "Disable mmap (memory-mapped file) usage"
             })
+            .option("noDirectIo", {
+                type: "boolean",
+                default: false,
+                description: "Disable Direct I/O usage when available"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -342,7 +348,8 @@
         noTrimWhitespace, grammar, jsonSchemaGrammarFile, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory,
-        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+        environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo,
+        printTimings
     }) {
         try {
             await RunChat({
@@ -351,7 +358,7 @@
                 temperature, minP, topK, topP, seed,
                 gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
                 maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-                debug, numa, meter, timing, noMmap, printTimings
+                debug, numa, meter, timing, noMmap, noDirectIo, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -368,7 +375,7 @@ async function RunChat({
     jsonSchemaGrammarFile: jsonSchemaGrammarFilePath,
     threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine,
     repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, reasoningBudget, noHistory, environmentFunctions, tokenPredictionDraftModel,
-    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+    tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
 }: ChatCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -395,6 +402,7 @@ async function RunChat({
     });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
+    const useDirectIo = !noDirectIo;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
@@ -452,6 +460,7 @@ async function RunChat({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -486,6 +495,7 @@ async function RunChat({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
         },
@@ -591,6 +601,7 @@ async function RunChat({
         context,
         draftContext,
         useMmap,
+        useDirectIo,
         printBos: true,
         printEos: true,
         logBatchSize,

src/cli/commands/CompleteCommand.ts

Lines changed: 13 additions & 3 deletions
@@ -54,6 +54,7 @@ type CompleteCommand = {
     meter: boolean,
     timing: boolean,
     noMmap: boolean,
+    noDirectIo: boolean,
     printTimings: boolean
 };

@@ -249,6 +250,11 @@ export const CompleteCommand: CommandModule<object, CompleteCommand> = {
                 default: false,
                 description: "Disable mmap (memory-mapped file) usage"
             })
+            .option("noDirectIo", {
+                type: "boolean",
+                default: false,
+                description: "Disable Direct I/O usage when available"
+            })
             .option("printTimings", {
                 alias: "pt",
                 type: "boolean",
@@ -261,14 +267,14 @@
         flashAttention, swaFullCache, threads, temperature, minP, topK,
         topP, seed, gpuLayers, repeatPenalty, lastTokensRepeatPenalty, penalizeRepeatingNewLine,
         repeatFrequencyPenalty, repeatPresencePenalty, maxTokens, tokenPredictionDraftModel, tokenPredictionModelContextSize,
-        debug, numa, meter, timing, noMmap, printTimings
+        debug, numa, meter, timing, noMmap, noDirectIo, printTimings
     }) {
         try {
             await RunCompletion({
                 modelPath, header, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
                 threads, temperature, minP, topK, topP, seed, gpuLayers, lastTokensRepeatPenalty,
                 repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty, maxTokens,
-                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, printTimings
+                tokenPredictionDraftModel, tokenPredictionModelContextSize, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
             });
         } catch (err) {
             await new Promise((accept) => setTimeout(accept, 0)); // wait for logs to finish printing
@@ -283,7 +289,7 @@ async function RunCompletion({
     modelPath: modelArg, header: headerArg, gpu, systemInfo, text, textFile, contextSize, batchSize, flashAttention, swaFullCache,
     threads, temperature, minP, topK, topP, seed, gpuLayers,
     lastTokensRepeatPenalty, repeatPenalty, penalizeRepeatingNewLine, repeatFrequencyPenalty, repeatPresencePenalty,
-    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, printTimings
+    tokenPredictionDraftModel, tokenPredictionModelContextSize, maxTokens, debug, numa, meter, timing, noMmap, noDirectIo, printTimings
 }: CompleteCommand) {
     if (contextSize === -1) contextSize = undefined;
     if (gpuLayers === -1) gpuLayers = undefined;
@@ -308,6 +314,7 @@ async function RunCompletion({
     });
     const logBatchSize = batchSize != null;
     const useMmap = !noMmap && llama.supportsMmap;
+    const useDirectIo = !noDirectIo;

     const resolvedModelPath = await resolveCommandGgufPath(modelArg, llama, headers, {
         flashAttention,
@@ -358,6 +365,7 @@ async function RunCompletion({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         ignoreMemorySafetyChecks: gpuLayers != null,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
@@ -392,6 +400,7 @@ async function RunCompletion({
         defaultContextFlashAttention: flashAttention,
         defaultContextSwaFullCache: swaFullCache,
         useMmap,
+        useDirectIo,
         onLoadProgress(loadProgress: number) {
             progressUpdater.setProgress(loadProgress);
         },
@@ -470,6 +479,7 @@ async function RunCompletion({
         context,
         draftContext,
         useMmap,
+        useDirectIo,
         minTitleLength: "Complete".length + 1,
         logBatchSize,
         tokenMeterEnabled: meter
