Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 95 additions & 46 deletions bench/iforest_bench.clj
Original file line number Diff line number Diff line change
Expand Up @@ -40,29 +40,45 @@
:p90 (nth sorted (int (* iters 0.9)))}))

(defn- auc-roc
"AUC-ROC via trapezoidal rule."
"AUC-ROC via trapezoidal rule. Primitive arrays only — no boxing."
[^doubles scores ^longs labels ^long n]
(let [pairs (sort-by first > (mapv (fn [i] [(aget scores i) (aget labels i)]) (range n)))
n-pos (areduce labels i s (long 0) (+ s (aget labels i)))
(let [n-pos (areduce labels i s (long 0) (+ s (aget labels i)))
n-neg (- n n-pos)]
(if (or (zero? n-pos) (zero? n-neg))
Double/NaN
(loop [tp 0.0, fp 0.0, prev-tp 0.0, prev-fp 0.0, auc 0.0, idx 0]
(if (>= idx n)
(let [tpr (/ tp n-pos) fpr (/ fp n-neg)
prev-tpr (/ prev-tp n-pos) prev-fpr (/ prev-fp n-neg)]
(+ auc (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))))
(let [[_ label] (nth pairs idx)
new-tp (if (== label 1) (inc tp) tp)
new-fp (if (== label 0) (inc fp) fp)]
(if (or (= idx (dec n))
(not= (first (nth pairs idx))
(first (nth pairs (min (inc idx) (dec n))))))
(let [tpr (/ new-tp n-pos) fpr (/ new-fp n-neg)
prev-tpr (/ prev-tp n-pos) prev-fpr (/ prev-fp n-neg)
area (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))]
(recur new-tp new-fp new-tp new-fp (+ auc area) (inc idx)))
(recur new-tp new-fp prev-tp prev-fp auc (inc idx)))))))))
;; Build sorted-scores and sorted-labels arrays (descending by score)
(let [^"[Ljava.lang.Integer;" idx-box
(let [^"[Ljava.lang.Integer;" a (make-array Integer n)]
(dotimes [i n] (aset a i (Integer/valueOf i)))
(java.util.Arrays/sort a
(reify java.util.Comparator
(compare [_ a b]
(Double/compare (aget scores (int b)) (aget scores (int a))))))
a)
sorted-scores (double-array n)
sorted-labels (long-array n)
_ (dotimes [i n]
(let [ix (int (aget idx-box i))]
(aset sorted-scores i (aget scores ix))
(aset sorted-labels i (aget labels ix))))
inv-n-pos (/ 1.0 (double n-pos))
inv-n-neg (/ 1.0 (double n-neg))]
(loop [tp 0.0, fp 0.0, prev-tp 0.0, prev-fp 0.0, auc 0.0, i (int 0)]
(if (>= i n)
(let [tpr (* tp inv-n-pos) fpr (* fp inv-n-neg)
prev-tpr (* prev-tp inv-n-pos) prev-fpr (* prev-fp inv-n-neg)]
(+ auc (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))))
(let [label (aget sorted-labels i)
new-tp (if (== label 1) (+ tp 1.0) tp)
new-fp (if (== label 0) (+ fp 1.0) fp)
next-i (unchecked-inc-int i)]
(if (or (>= next-i n)
(not= (aget sorted-scores i) (aget sorted-scores next-i)))
(let [tpr (* new-tp inv-n-pos) fpr (* new-fp inv-n-neg)
prev-tpr (* prev-tp inv-n-pos) prev-fpr (* prev-fp inv-n-neg)
area (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))]
(recur new-tp new-fp new-tp new-fp (+ auc area) next-i))
(recur new-tp new-fp prev-tp prev-fp auc next-i)))))))))

(defn- load-odds-csv
"Load ODDS CSV (last col = label)."
Expand All @@ -76,8 +92,8 @@
first-fields (str/split (first data-lines) #",")
n-cols (count first-fields)
d (dec n-cols)
features (into-array (Class/forName "[D")
(mapv (fn [_] (double-array n)) (range d)))
^"[[D" features (into-array (Class/forName "[D")
(mapv (fn [_] (double-array n)) (range d)))
labels (long-array n)]
(doseq [[i line] (map-indexed vector data-lines)]
(let [fields (str/split line #",")]
Expand Down Expand Up @@ -129,6 +145,7 @@
(defn- bench-odds-dataset [name path expected-auc]
(if-let [{:keys [features labels n d]} (load-odds-csv path)]
(let [fmap (features->map features)
^longs labels labels
t0 (System/nanoTime)
model (iforest/train {:from fmap :n-trees 100 :sample-size 256 :seed 42})
train-ms (/ (- (System/nanoTime) t0) 1e6)
Expand All @@ -140,9 +157,25 @@
{:name name :auc auc :n n :d d})
(println (format " %-15s SKIPPED (not found: %s)" name path))))

(defn- ensure-odds-data
  "Ensure the ODDS benchmark CSVs exist under data/odds/, downloading them
  via bin/download-odds when missing. The download script itself requires
  python3 with scipy and h5py on PATH to convert the .mat files.
  Best-effort: prints a WARNING instead of throwing when the script is
  missing or exits non-zero, so benchmarks can still run partially."
  []
  ;; shuttle.csv is the first file the download script produces, so its
  ;; absence is used as the sentinel for \"data never fetched\".
  (when-not (.exists (io/file "data/odds/shuttle.csv"))
    (println "  ODDS data not found — downloading via bin/download-odds ...")
    (let [script (io/file "bin/download-odds")]
      (if (.exists script)
        ;; Run the script through bash with stdio inherited so curl progress
        ;; and the script's own per-dataset output stream to this console.
        (let [^ProcessBuilder pb (ProcessBuilder. ^"[Ljava.lang.String;"
                                                  (into-array String ["bash" "bin/download-odds"]))
              _ (.inheritIO pb)
              proc (.start pb)
              ;; Block until the download/convert script finishes.
              exit (.waitFor proc)]
          (when-not (zero? exit)
            (println "  WARNING: download-odds failed (exit" exit "). Install scipy: pip install scipy")))
        (println "  WARNING: bin/download-odds not found")))))

(defn- bench-odds []
(println "\n=== ODDS Dataset Accuracy ===")
(println " (Run bin/download-odds to fetch datasets)")
(ensure-odds-data)
(bench-odds-dataset "Shuttle" "data/odds/shuttle.csv" 0.95)
(bench-odds-dataset "Http" "data/odds/http.csv" 0.95)
(bench-odds-dataset "ForestCover" "data/odds/forestcover.csv" 0.80)
Expand All @@ -155,9 +188,22 @@
;; ============================================================================

(defn- run-pyod-comparison [dataset-path dataset-name]
"Run PyOD IsolationForest on the same dataset and compare AUC."
(when (.exists (io/file dataset-path))
(let [py-script (str "
"Run Stratum and PyOD IsolationForest on the same dataset, print side by side."
(if-not (.exists (io/file dataset-path))
(println (format " %-15s SKIPPED (no data: %s — run bin/download-odds)" dataset-name dataset-path))
(let [{:keys [features labels n d]} (load-odds-csv dataset-path)
fmap (features->map features)
^longs labels labels
;; Stratum
t0 (System/nanoTime)
model (iforest/train {:from fmap :n-trees 100 :sample-size 256 :seed 42})
stratum-train-ms (/ (- (System/nanoTime) t0) 1e6)
t1 (System/nanoTime)
stratum-scores (iforest/score model fmap)
stratum-score-ms (/ (- (System/nanoTime) t1) 1e6)
stratum-auc (auc-roc stratum-scores labels n)
;; PyOD
py-script (str "
import sys, time
import numpy as np
from sklearn.metrics import roc_auc_score
Expand All @@ -173,7 +219,7 @@ clf = IForest(n_estimators=100, max_samples=256, random_state=42)
clf.fit(X)
train_time = (time.time() - t0) * 1000
t0 = time.time()
scores = clf.decision_scores_
scores = clf.decision_function(X)
score_time = (time.time() - t0) * 1000
auc = roc_auc_score(y, scores)
print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
Expand All @@ -183,24 +229,27 @@ print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
(into-array String ["python3" "-c" py-script]))
_ (.redirectErrorStream pb true)
proc (.start pb)
output (slurp (.getInputStream proc))
output (str/trim (slurp (.getInputStream proc)))
_ (.waitFor proc)]
(let [output (str/trim output)]
(cond
(str/includes? output "PYOD_NOT_INSTALLED")
(println (format " PyOD %-12s SKIPPED (pip install pyod)" dataset-name))
(println (format " %-15s n=%,7d d=%2d" dataset-name n d))
(println (format " Stratum AUC=%.4f train=%s score=%s"
stratum-auc (fmt-ms stratum-train-ms) (fmt-ms stratum-score-ms)))
(cond
(str/includes? output "PYOD_NOT_INSTALLED")
(println " PyOD SKIPPED (pip install pyod)")

(str/includes? output ",")
(let [[auc train score] (str/split output #",")]
(println (format " PyOD %-12s AUC=%.4f train=%sms score=%sms"
dataset-name (Double/parseDouble auc) train score)))
(str/includes? output ",")
(let [[auc train score] (str/split output #",")]
(println (format " PyOD AUC=%.4f train=%7sms score=%7sms"
(Double/parseDouble auc) (str/trim train) (str/trim score))))

:else
(println (format " PyOD %-12s ERROR: %s" dataset-name output)))))))
:else
(println (format " PyOD ERROR: %s" output))))))

(defn- bench-pyod []
(println "\n=== PyOD Comparison ===")
(println " (pip install pyod scikit-learn to enable)")
(println "\n=== Stratum vs PyOD Comparison ===")
(println " (100 trees, 256 sample size, same seed)")
(ensure-odds-data)
(run-pyod-comparison "data/odds/shuttle.csv" "Shuttle")
(run-pyod-comparison "data/odds/http.csv" "Http")
(run-pyod-comparison "data/odds/forestcover.csv" "ForestCover")
Expand All @@ -219,13 +268,13 @@ print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
rng (Random. 42)
;; First half: outliers at [10,10,10]
;; Second half: outliers shift to [-10,-10,-10]
features (into-array (Class/forName "[D")
(mapv (fn [_]
(let [arr (double-array n)]
(dotimes [i n]
(aset arr i (.nextGaussian ^Random rng)))
arr))
(range d)))
^"[[D" features (into-array (Class/forName "[D")
(mapv (fn [_]
(let [arr (double-array n)]
(dotimes [i n]
(aset arr i (.nextGaussian ^Random rng)))
arr))
(range d)))
labels (long-array n)
;; Inject outliers: 1% rate
_ (dotimes [i 500]
Expand Down
126 changes: 71 additions & 55 deletions bin/download-odds
Original file line number Diff line number Diff line change
@@ -1,95 +1,111 @@
#!/usr/bin/env bash
# Download ODDS (Outlier Detection DataSets) for isolation forest benchmarks.
# Source: https://odds.cs.stonybrook.edu/
# Source: https://odds.cs.stonybrook.edu/ (canonical Dropbox mirrors)
#
# Datasets are in .mat (MATLAB) format. We convert to CSV using Python.
# Last column is the anomaly label (0=normal, 1=anomaly).
#
# Requires: python3, scipy, h5py (pip install scipy h5py)

set -euo pipefail

DATADIR="data/odds"
mkdir -p "$DATADIR"

echo "Downloading ODDS datasets..."
# Check python3 + scipy + h5py availability up front
if ! python3 -c "import scipy.io, h5py" 2>/dev/null; then
echo "ERROR: python3 with scipy and h5py is required."
echo " pip install scipy h5py"
exit 1
fi

# Shuttle dataset (49097 rows, 9 features, 7.2% anomalies)
if [ ! -f "$DATADIR/shuttle.csv" ]; then
echo " Downloading shuttle.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/shuttle.mat" -o "$DATADIR/shuttle.mat"
# Conversion helper: handles both MATLAB v5 (scipy) and v7.3/HDF5 (h5py)
convert_mat() {
local matfile="$1"
local csvfile="$2"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/shuttle.mat')
X, y = d['X'], d['y'].flatten()
import numpy as np
try:
import scipy.io
d = scipy.io.loadmat('$matfile')
X, y = d['X'], d['y'].flatten()
except NotImplementedError:
import h5py
with h5py.File('$matfile', 'r') as f:
X = np.array(f['X']).T # h5py reads transposed
y = np.array(f['y']).flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/shuttle.csv', data, delimiter=',', fmt='%.6f')
print(f' shuttle: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
np.savetxt('$csvfile', data, delimiter=',', fmt='%.6f')
print(f' {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
}

echo "Downloading ODDS datasets..."

# Shuttle dataset (49097 rows, 9 features, 7.2% anomalies)
if [ ! -f "$DATADIR/shuttle.csv" ]; then
echo -n " shuttle: "
curl -sL "https://www.dropbox.com/s/mk8ozgisimfn3dw/shuttle.mat?dl=1" -o "$DATADIR/shuttle.mat"
convert_mat "$DATADIR/shuttle.mat" "$DATADIR/shuttle.csv"
rm -f "$DATADIR/shuttle.mat"
fi

# Http (KDD) dataset (567498 rows, 3 features, 0.4% anomalies)
if [ ! -f "$DATADIR/http.csv" ]; then
echo " Downloading http.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/http.mat" -o "$DATADIR/http.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/http.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/http.csv', data, delimiter=',', fmt='%.6f')
print(f' http: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
echo -n " http: "
curl -sL "https://www.dropbox.com/s/iy9ucsifal754tp/http.mat?dl=1" -o "$DATADIR/http.mat"
convert_mat "$DATADIR/http.mat" "$DATADIR/http.csv"
rm -f "$DATADIR/http.mat"
fi

# ForestCover dataset (286048 rows, 10 features, 0.96% anomalies)
if [ ! -f "$DATADIR/forestcover.csv" ]; then
echo " Downloading forestcover.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/forestcover.mat" -o "$DATADIR/forestcover.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/forestcover.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/forestcover.csv', data, delimiter=',', fmt='%.6f')
print(f' forestcover: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
rm -f "$DATADIR/forestcover.mat"
echo -n " forestcover: "
curl -sL "https://www.dropbox.com/s/awx8iuzbu8dkxf1/cover.mat?dl=1" -o "$DATADIR/cover.mat"
convert_mat "$DATADIR/cover.mat" "$DATADIR/forestcover.csv"
rm -f "$DATADIR/cover.mat"
fi

# Mammography dataset (11183 rows, 6 features, 2.32% anomalies)
if [ ! -f "$DATADIR/mammography.csv" ]; then
echo " Downloading mammography.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/mammography.mat" -o "$DATADIR/mammography.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/mammography.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/mammography.csv', data, delimiter=',', fmt='%.6f')
print(f' mammography: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
echo -n " mammography: "
curl -sL "https://www.dropbox.com/s/tq2v4hhwyv17hlk/mammography.mat?dl=1" -o "$DATADIR/mammography.mat"
convert_mat "$DATADIR/mammography.mat" "$DATADIR/mammography.csv"
rm -f "$DATADIR/mammography.mat"
fi

# Pima Indians Diabetes dataset (768 rows, 8 features, 34.9% anomalies — hard)
if [ ! -f "$DATADIR/pima.csv" ]; then
echo " Downloading pima.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/pima.mat" -o "$DATADIR/pima.mat"
echo -n " pima: "
curl -sL "https://www.dropbox.com/s/mvlwu7p0nyk2a2r/pima.mat?dl=1" -o "$DATADIR/pima.mat"
convert_mat "$DATADIR/pima.mat" "$DATADIR/pima.csv"
rm -f "$DATADIR/pima.mat"
fi

# Credit Card Fraud dataset (284807 rows, 30 features, 0.17% anomalies)
# Source: OpenML mirror of Kaggle mlg-ulb/creditcardfraud
if [ ! -f "$DATADIR/creditcard.csv" ]; then
echo -n " creditcard: "
curl -sL "https://www.openml.org/data/v1/get_csv/1673544" -o "$DATADIR/creditcard_raw.csv"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/pima.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/pima.csv', data, delimiter=',', fmt='%.6f')
print(f' pima: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
import numpy as np, csv
with open('$DATADIR/creditcard_raw.csv') as f:
reader = csv.reader(f)
header = next(reader)
rows = list(reader)
n = len(rows)
d = len(header) - 1 # last col is Class
data = np.empty((n, d + 1))
for i, row in enumerate(rows):
for j in range(d):
data[i, j] = float(row[j])
data[i, d] = float(row[d].strip(\"'\"))
np.savetxt('$DATADIR/creditcard.csv', data, delimiter=',', fmt='%.6f')
n_anom = int(data[:, -1].sum())
print(f' {n} rows, {d} features, {n_anom} anomalies ({100*n_anom/n:.2f}%)')
"
rm -f "$DATADIR/pima.mat"
rm -f "$DATADIR/creditcard_raw.csv"
fi

echo ""
echo "Done. ODDS datasets saved to $DATADIR/"
echo ""
echo "Optional: For Credit Card fraud dataset (Kaggle, 284K rows, 28 features):"
echo " 1. Download from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud"
echo " 2. Place creditcard.csv in $DATADIR/"
Loading
Loading