Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions go/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# build artifacts
build/*
80 changes: 80 additions & 0 deletions go/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Makefile for the Magika Go CLI and library.
#
# Build artifacts land in $(BUILD_DIR). Override any variable on the command
# line, e.g.:
# make build ONNXRUNTIME_PREFIX=/opt/onnxruntime
# make run ARGS="-l path/to/file"
# make test MAGIKA_MODEL=standard_v3_3

# Kernel name reported by `uname -s`; evaluated once, when the file is parsed.
UNAME_S := $(shell uname -s)

# Per-OS defaults. ONNXRUNTIME_PREFIX and ONNX_INCLUDE use ?= so a value from
# the command line or the environment wins.
#   Darwin : Homebrew prefix, headers under include/onnxruntime/, and
#            DYLD_LIBRARY_PATH as the runtime library search variable.
#   other  : /opt/onnxruntime, headers directly under include/, and
#            LD_LIBRARY_PATH as the runtime library search variable.
ifneq ($(UNAME_S),Darwin)
  ONNXRUNTIME_PREFIX ?= /opt/onnxruntime
  ONNX_INCLUDE       ?= $(ONNXRUNTIME_PREFIX)/include
  RUNTIME_LIB_VAR    := LD_LIBRARY_PATH
else
  ONNXRUNTIME_PREFIX ?= /opt/homebrew/opt/onnxruntime
  ONNX_INCLUDE       ?= $(ONNXRUNTIME_PREFIX)/include/onnxruntime
  RUNTIME_LIB_VAR    := DYLD_LIBRARY_PATH
endif

# The library directory has the same layout on every OS.
ONNX_LIB ?= $(ONNXRUNTIME_PREFIX)/lib

# Build output: the CLI binary is written to $(BUILD_DIR)/magika. GO_TAGS is
# the build tag handed to every `go` command in this file.
BUILD_DIR ?= build
BINARY    := $(BUILD_DIR)/magika
GO_TAGS   := onnxruntime

# Model location. The assets directory is anchored on $(CURDIR), so the
# default is an absolute path. Both values can be overridden on the command
# line or from the environment.
MAGIKA_ASSETS_DIR ?= $(CURDIR)/../assets
MAGIKA_MODEL      ?= standard_v3_3

# cgo and runtime env baked into every recipe that touches the scanner.
# CGO_CFLAGS / CGO_LDFLAGS point cgo at the ONNX Runtime headers and libraries.
export CGO_CFLAGS := -I$(ONNX_INCLUDE)
export CGO_LDFLAGS := -L$(ONNX_LIB)
# Prepend $(ONNX_LIB) to the runtime library search variable selected by
# RUNTIME_LIB_VAR. The ':' separator is emitted only when the variable already
# holds a value: an unconditional "$(ONNX_LIB):" would leave a trailing empty
# path element, which ld.so on Linux interprets as the current working
# directory.
export $(RUNTIME_LIB_VAR) := $(ONNX_LIB)$(if $($(RUNTIME_LIB_VAR)),:$($(RUNTIME_LIB_VAR)))
# Hand the model selection to the child processes started by the recipes.
export MAGIKA_ASSETS_DIR
export MAGIKA_MODEL

.PHONY: help build run test bench vet fmt clean

# Print the available targets followed by the current values of the key
# variables. This is the first rule in the file, so a bare `make` runs it.
# A single printf emits one line per argument; the empty argument produces
# the blank separator line.
help:
	@printf '%s\n' \
		"Targets:" \
		" build - compile CLI into $(BINARY)" \
		" run - go run ./cli -- pass args via ARGS=..." \
		" test - run full test suite (cli + magika)" \
		" bench - run Benchmark* entries only" \
		" vet - go vet with onnxruntime tag" \
		" fmt - gofmt -w on the module" \
		" clean - remove $(BUILD_DIR)/*" \
		"" \
		"Key variables (current values):" \
		" ONNXRUNTIME_PREFIX = $(ONNXRUNTIME_PREFIX)" \
		" MAGIKA_ASSETS_DIR = $(MAGIKA_ASSETS_DIR)" \
		" MAGIKA_MODEL = $(MAGIKA_MODEL)"

# Compile the ./cli package into $(BINARY). The output directory is created
# first (silently) so a fresh checkout works without a prior mkdir.
build:
	@mkdir -p $(BUILD_DIR)
	go build -o $(BINARY) -tags=$(GO_TAGS) ./cli

# Run the CLI straight from source. $(ARGS) is left unquoted on purpose so the
# shell splits it into separate CLI arguments, e.g.
#   make run ARGS="-l path/to/file"
run:
	go run -tags=$(GO_TAGS) ./cli $(ARGS)

# Run the tests of every package in the module.
test:
	go test -tags=$(GO_TAGS) ./...

# Run the benchmarks under ./cli/... with allocation statistics. The -run
# pattern '^$' matches no test name, so only Benchmark* functions execute;
# `$$` is Make's escape for a literal `$`.
bench:
	go test -tags=$(GO_TAGS) -run='^$$' -bench=. -benchmem ./cli/...

# Static analysis over every package, compiled with the same build tag as the
# other targets.
vet:
	go vet -tags=$(GO_TAGS) ./...

# Rewrite the Go sources under the current directory in place, using gofmt's
# canonical formatting.
fmt:
	gofmt -w .

# Remove everything under $(BUILD_DIR) while keeping the directory itself.
# The first recipe line is a guard: if BUILD_DIR is empty or whitespace-only
# (e.g. `make clean BUILD_DIR=`), the glob below would become `/*`, so make
# aborts with an error before rm runs. When BUILD_DIR is set, the guard
# expands to nothing and is skipped.
clean:
	$(if $(strip $(BUILD_DIR)),,$(error BUILD_DIR is empty; refusing to run 'rm -rf /*'))
	rm -rf $(BUILD_DIR)/*
208 changes: 205 additions & 3 deletions go/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,17 @@ to creating a scanner associated with a given model, and scanning the content.
//
// It requires the onnxruntime and the Magika assets to be accessible.
// onnxruntime is available on https://github.com/microsoft/onnxruntime/releases
// Magika asserts are available on https://github.com/google/magika/tree/main/assets
// Magika assets are available on https://github.com/google/magika/tree/main/assets
//
// Tag and link directives must be provided a build or run time:
// go run -tags onnxruntime -ldflags="-linkmode=external -extldflags=-L/opt/onnxruntime/lib" .
// Tag, cgo, and link directives must be provided at build or run time, e.g.:
// CGO_CFLAGS=-I/opt/onnxruntime/include CGO_LDFLAGS=-L/opt/onnxruntime/lib \
// go run -tags onnxruntime .

package main

import (
"fmt"
"log"
"strings"

"github.com/google/magika/go/magika"
Expand Down Expand Up @@ -56,6 +58,206 @@ func main() {
Inspiration on how to download and install onnxruntime and magika assets can be found in [`docker/Dockerfile`](docker/Dockerfile),
and [`cli/cli.go`](cli/cli.go) provides a somewhat more elaborate usage of the go binding.

## CLI

The [`cli`](./cli) package ships a command-line tool with the following features:
recursive directory traversal, JSON/JSONL output, custom format strings,
MIME-type and label modes, score reporting, stdin input via `-`, and
colored output.

### Build & run

The CLI requires cgo and the ONNX Runtime C library. Point the compiler at
your local install and load the matching assets dir and model.

#### Using the Makefile

The `Makefile` wraps the usual env-var plumbing. It auto-detects the OS and
defaults `ONNXRUNTIME_PREFIX` to `/opt/homebrew/opt/onnxruntime` on macOS and
`/opt/onnxruntime` on Linux; `MAGIKA_ASSETS_DIR` defaults to `../assets` and
`MAGIKA_MODEL` to `standard_v3_3`. Override any variable on the command line.

```shell
make help # list targets and current defaults
make build # produces build/magika
make run ARGS="-l path/to/file" # go run ./cli with passthrough args
make run ARGS="--json path/to/file"
make test # full test suite
make bench # Benchmark* only, with -benchmem
make clean # remove build/*

# Override the ONNX Runtime location or model
make build ONNXRUNTIME_PREFIX=/opt/onnxruntime
make run ARGS="-l path/to/file" MAGIKA_MODEL=standard_v3_3
```

The resulting `build/magika` binary needs the ONNX Runtime library resolvable
at runtime (`DYLD_LIBRARY_PATH` on macOS, `LD_LIBRARY_PATH` on Linux); the
`make run` target handles that for you.

#### macOS (Homebrew):

```shell
# Run directly
MAGIKA_ASSETS_DIR=../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/homebrew/Cellar/onnxruntime/1.24.4_1/include/onnxruntime" \
CGO_LDFLAGS="-L/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib" \
go run -tags onnxruntime ./cli/... path/to/file

# Build a standalone binary
CGO_CFLAGS="-I/opt/homebrew/Cellar/onnxruntime/1.24.4_1/include/onnxruntime" \
CGO_LDFLAGS="-L/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib" \
go build -tags onnxruntime -o build/magika ./cli

# Run the binary (onnxruntime dylib must be resolvable at runtime)
DYLD_LIBRARY_PATH=/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib \
MAGIKA_ASSETS_DIR=../assets \
MAGIKA_MODEL=standard_v3_3 \
./build/magika path/to/file
```

#### Linux (e.g. onnxruntime extracted to `/opt/onnxruntime`):

```shell
# Run directly
MAGIKA_ASSETS_DIR=../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/onnxruntime/include" \
CGO_LDFLAGS="-L/opt/onnxruntime/lib" \
LD_LIBRARY_PATH=/opt/onnxruntime/lib \
go run -tags onnxruntime ./cli/... path/to/file

# Build a standalone binary
CGO_CFLAGS="-I/opt/onnxruntime/include" \
CGO_LDFLAGS="-L/opt/onnxruntime/lib" \
go build -tags onnxruntime -o build/magika ./cli

# Run the binary (onnxruntime .so must be resolvable at runtime)
LD_LIBRARY_PATH=/opt/onnxruntime/lib \
MAGIKA_ASSETS_DIR=../assets \
MAGIKA_MODEL=standard_v3_3 \
./build/magika path/to/file
```

Equivalent invocation using CLI flags instead of environment variables:

```shell
go run -tags onnxruntime ./cli/... \
--assets-dir ../assets \
--model standard_v3_3 \
path/to/file
```

Flags take precedence over `MAGIKA_ASSETS_DIR` and `MAGIKA_MODEL`.

### Flags

| Flag | Description |
|------|-------------|
| `--assets-dir <dir>` | Assets directory (overrides `MAGIKA_ASSETS_DIR`). |
| `--model <name>` | Model name under `assets/models/` (overrides `MAGIKA_MODEL`). |
| `-r`, `--recursive` | Identify files inside directories instead of the directory itself. |
| `--no-dereference` | Treat symbolic links as symlinks rather than following them. |
| `--colors` / `--no-colors` | Force or disable color output. |
| `-s`, `--output-score` | Append the prediction score to each line. |
| `-i`, `--mime-type` | Print the MIME type instead of the description. |
| `-l`, `--label` | Print the short label instead of the description. |
| `--json` | Print results as a JSON array. |
| `--jsonl` | Print results as newline-delimited JSON. |
| `--format <fmt>` | Custom format. Placeholders: `%p %l %d %g %m %e %s %S %b %%`. |

Use `-` as a path to read from standard input (at most once per invocation).
The process exits with status `1` if any path fails to scan.

### Examples

```shell
$ magika --assets-dir assets --model standard_v3_3 tests_data/basic/python/code.py tests_data/basic/zip/magika_test.zip
tests_data/basic/python/code.py: Python source (code)
tests_data/basic/zip/magika_test.zip: Zip archive data (archive)

$ magika --assets-dir assets --model standard_v3_3 -l -s tests_data/basic/python/code.py
tests_data/basic/python/code.py: python 99%

$ magika --assets-dir assets --model standard_v3_3 --json tests_data/basic/python/code.py
[
{
"path": "tests_data/basic/python/code.py",
"result": {
"status": "ok",
"value": {
"dl": { "label": "python", ... },
"output": { "label": "python", ... },
"score": 0.997
}
}
}
]

$ cat tests_data/basic/ini/doc.ini | magika --assets-dir assets --model standard_v3_3 -l -
-: ini
```

### Tests & benchmarks

Tests and benchmarks live under `./cli/...` and require the same cgo setup as
the build. The scanner-backed tests/benchmarks load the assets from
`MAGIKA_ASSETS_DIR` + `MAGIKA_MODEL`; pure-formatter benchmarks run without
them and are skipped by the `bench` command above when unset.

#### Using the Makefile

```shell
make test # full test suite
make bench # Benchmark* only (-benchmem)
make test ONNXRUNTIME_PREFIX=/opt/onnxruntime # override runtime location
```

#### macOS (Homebrew):

```shell
# Run the full test suite
MAGIKA_ASSETS_DIR=../../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/homebrew/Cellar/onnxruntime/1.24.4_1/include/onnxruntime" \
CGO_LDFLAGS="-L/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib" \
DYLD_LIBRARY_PATH=/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib \
go test -tags onnxruntime ./cli/...

# Run only the benchmarks
MAGIKA_ASSETS_DIR=../../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/homebrew/Cellar/onnxruntime/1.24.4_1/include/onnxruntime" \
CGO_LDFLAGS="-L/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib" \
DYLD_LIBRARY_PATH=/opt/homebrew/Cellar/onnxruntime/1.24.4_1/lib \
go test -tags onnxruntime -run=^$ -bench=. ./cli/...
```

#### Linux (e.g. onnxruntime extracted to `/opt/onnxruntime`):

```shell
# Run the full test suite
MAGIKA_ASSETS_DIR=../../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/onnxruntime/include" \
CGO_LDFLAGS="-L/opt/onnxruntime/lib" \
LD_LIBRARY_PATH=/opt/onnxruntime/lib \
go test -tags onnxruntime ./cli/...

# Run only the benchmarks
MAGIKA_ASSETS_DIR=../../assets \
MAGIKA_MODEL=standard_v3_3 \
CGO_CFLAGS="-I/opt/onnxruntime/include" \
CGO_LDFLAGS="-L/opt/onnxruntime/lib" \
LD_LIBRARY_PATH=/opt/onnxruntime/lib \
go test -tags onnxruntime -run=^$ -bench=. ./cli/...
```

`-run=^$` disables the functional tests so only `Benchmark*` entries run. Use
`-benchtime=5s` or `-benchmem` to adjust duration and enable per-op alloc
stats.

## Content
- [`docker`](./docker) contains a sample docker file that builds a
container image that ties together a Magika CLI, an ONNX Runtime,
Expand Down
Loading