Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 95 additions & 46 deletions bench/iforest_bench.clj
Original file line number Diff line number Diff line change
Expand Up @@ -40,29 +40,45 @@
:p90 (nth sorted (int (* iters 0.9)))}))

(defn- auc-roc
"AUC-ROC via trapezoidal rule."
"AUC-ROC via trapezoidal rule. Primitive arrays only — no boxing."
[^doubles scores ^longs labels ^long n]
(let [pairs (sort-by first > (mapv (fn [i] [(aget scores i) (aget labels i)]) (range n)))
n-pos (areduce labels i s (long 0) (+ s (aget labels i)))
(let [n-pos (areduce labels i s (long 0) (+ s (aget labels i)))
n-neg (- n n-pos)]
(if (or (zero? n-pos) (zero? n-neg))
Double/NaN
(loop [tp 0.0, fp 0.0, prev-tp 0.0, prev-fp 0.0, auc 0.0, idx 0]
(if (>= idx n)
(let [tpr (/ tp n-pos) fpr (/ fp n-neg)
prev-tpr (/ prev-tp n-pos) prev-fpr (/ prev-fp n-neg)]
(+ auc (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))))
(let [[_ label] (nth pairs idx)
new-tp (if (== label 1) (inc tp) tp)
new-fp (if (== label 0) (inc fp) fp)]
(if (or (= idx (dec n))
(not= (first (nth pairs idx))
(first (nth pairs (min (inc idx) (dec n))))))
(let [tpr (/ new-tp n-pos) fpr (/ new-fp n-neg)
prev-tpr (/ prev-tp n-pos) prev-fpr (/ prev-fp n-neg)
area (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))]
(recur new-tp new-fp new-tp new-fp (+ auc area) (inc idx)))
(recur new-tp new-fp prev-tp prev-fp auc (inc idx)))))))))
;; Build sorted-scores and sorted-labels arrays (descending by score)
(let [^"[Ljava.lang.Integer;" idx-box
(let [^"[Ljava.lang.Integer;" a (make-array Integer n)]
(dotimes [i n] (aset a i (Integer/valueOf i)))
(java.util.Arrays/sort a
(reify java.util.Comparator
(compare [_ a b]
(Double/compare (aget scores (int b)) (aget scores (int a))))))
a)
sorted-scores (double-array n)
sorted-labels (long-array n)
_ (dotimes [i n]
(let [ix (int (aget idx-box i))]
(aset sorted-scores i (aget scores ix))
(aset sorted-labels i (aget labels ix))))
inv-n-pos (/ 1.0 (double n-pos))
inv-n-neg (/ 1.0 (double n-neg))]
(loop [tp 0.0, fp 0.0, prev-tp 0.0, prev-fp 0.0, auc 0.0, i (int 0)]
(if (>= i n)
(let [tpr (* tp inv-n-pos) fpr (* fp inv-n-neg)
prev-tpr (* prev-tp inv-n-pos) prev-fpr (* prev-fp inv-n-neg)]
(+ auc (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))))
(let [label (aget sorted-labels i)
new-tp (if (== label 1) (+ tp 1.0) tp)
new-fp (if (== label 0) (+ fp 1.0) fp)
next-i (unchecked-inc-int i)]
(if (or (>= next-i n)
(not= (aget sorted-scores i) (aget sorted-scores next-i)))
(let [tpr (* new-tp inv-n-pos) fpr (* new-fp inv-n-neg)
prev-tpr (* prev-tp inv-n-pos) prev-fpr (* prev-fp inv-n-neg)
area (* 0.5 (- fpr prev-fpr) (+ tpr prev-tpr))]
(recur new-tp new-fp new-tp new-fp (+ auc area) next-i))
(recur new-tp new-fp prev-tp prev-fp auc next-i)))))))))

(defn- load-odds-csv
"Load ODDS CSV (last col = label)."
Expand All @@ -76,8 +92,8 @@
first-fields (str/split (first data-lines) #",")
n-cols (count first-fields)
d (dec n-cols)
features (into-array (Class/forName "[D")
(mapv (fn [_] (double-array n)) (range d)))
^"[[D" features (into-array (Class/forName "[D")
(mapv (fn [_] (double-array n)) (range d)))
labels (long-array n)]
(doseq [[i line] (map-indexed vector data-lines)]
(let [fields (str/split line #",")]
Expand Down Expand Up @@ -129,6 +145,7 @@
(defn- bench-odds-dataset [name path expected-auc]
(if-let [{:keys [features labels n d]} (load-odds-csv path)]
(let [fmap (features->map features)
^longs labels labels
t0 (System/nanoTime)
model (iforest/train {:from fmap :n-trees 100 :sample-size 256 :seed 42})
train-ms (/ (- (System/nanoTime) t0) 1e6)
Expand All @@ -140,9 +157,25 @@
{:name name :auc auc :n n :d d})
(println (format " %-15s SKIPPED (not found: %s)" name path))))

(defn- ensure-odds-data
  "Ensure the ODDS benchmark CSVs exist under data/odds/, downloading them
  via bin/download-odds when missing. The download script itself requires
  python3 with scipy and h5py on PATH to convert the .mat files.
  Best-effort: prints a WARNING instead of throwing when the script is
  missing or exits non-zero, so benchmarks can still run partially."
  []
  ;; shuttle.csv is the first file the download script produces, so its
  ;; absence is used as the sentinel for \"data never fetched\".
  (when-not (.exists (io/file "data/odds/shuttle.csv"))
    (println "  ODDS data not found — downloading via bin/download-odds ...")
    (let [script (io/file "bin/download-odds")]
      (if (.exists script)
        ;; Run the script through bash with stdio inherited so curl progress
        ;; and the script's own per-dataset output stream to this console.
        (let [^ProcessBuilder pb (ProcessBuilder. ^"[Ljava.lang.String;"
                                                  (into-array String ["bash" "bin/download-odds"]))
              _ (.inheritIO pb)
              proc (.start pb)
              ;; Block until the download/convert script finishes.
              exit (.waitFor proc)]
          (when-not (zero? exit)
            (println "  WARNING: download-odds failed (exit" exit "). Install scipy: pip install scipy")))
        (println "  WARNING: bin/download-odds not found")))))

(defn- bench-odds []
(println "\n=== ODDS Dataset Accuracy ===")
(println " (Run bin/download-odds to fetch datasets)")
(ensure-odds-data)
(bench-odds-dataset "Shuttle" "data/odds/shuttle.csv" 0.95)
(bench-odds-dataset "Http" "data/odds/http.csv" 0.95)
(bench-odds-dataset "ForestCover" "data/odds/forestcover.csv" 0.80)
Expand All @@ -155,9 +188,22 @@
;; ============================================================================

(defn- run-pyod-comparison [dataset-path dataset-name]
"Run PyOD IsolationForest on the same dataset and compare AUC."
(when (.exists (io/file dataset-path))
(let [py-script (str "
"Run Stratum and PyOD IsolationForest on the same dataset, print side by side."
(if-not (.exists (io/file dataset-path))
(println (format " %-15s SKIPPED (no data: %s — run bin/download-odds)" dataset-name dataset-path))
(let [{:keys [features labels n d]} (load-odds-csv dataset-path)
fmap (features->map features)
^longs labels labels
;; Stratum
t0 (System/nanoTime)
model (iforest/train {:from fmap :n-trees 100 :sample-size 256 :seed 42})
stratum-train-ms (/ (- (System/nanoTime) t0) 1e6)
t1 (System/nanoTime)
stratum-scores (iforest/score model fmap)
stratum-score-ms (/ (- (System/nanoTime) t1) 1e6)
stratum-auc (auc-roc stratum-scores labels n)
;; PyOD
py-script (str "
import sys, time
import numpy as np
from sklearn.metrics import roc_auc_score
Expand All @@ -173,7 +219,7 @@ clf = IForest(n_estimators=100, max_samples=256, random_state=42)
clf.fit(X)
train_time = (time.time() - t0) * 1000
t0 = time.time()
scores = clf.decision_scores_
scores = clf.decision_function(X)
score_time = (time.time() - t0) * 1000
auc = roc_auc_score(y, scores)
print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
Expand All @@ -183,24 +229,27 @@ print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
(into-array String ["python3" "-c" py-script]))
_ (.redirectErrorStream pb true)
proc (.start pb)
output (slurp (.getInputStream proc))
output (str/trim (slurp (.getInputStream proc)))
_ (.waitFor proc)]
(let [output (str/trim output)]
(cond
(str/includes? output "PYOD_NOT_INSTALLED")
(println (format " PyOD %-12s SKIPPED (pip install pyod)" dataset-name))
(println (format " %-15s n=%,7d d=%2d" dataset-name n d))
(println (format " Stratum AUC=%.4f train=%s score=%s"
stratum-auc (fmt-ms stratum-train-ms) (fmt-ms stratum-score-ms)))
(cond
(str/includes? output "PYOD_NOT_INSTALLED")
(println " PyOD SKIPPED (pip install pyod)")

(str/includes? output ",")
(let [[auc train score] (str/split output #",")]
(println (format " PyOD %-12s AUC=%.4f train=%sms score=%sms"
dataset-name (Double/parseDouble auc) train score)))
(str/includes? output ",")
(let [[auc train score] (str/split output #",")]
(println (format " PyOD AUC=%.4f train=%7sms score=%7sms"
(Double/parseDouble auc) (str/trim train) (str/trim score))))

:else
(println (format " PyOD %-12s ERROR: %s" dataset-name output)))))))
:else
(println (format " PyOD ERROR: %s" output))))))

(defn- bench-pyod []
(println "\n=== PyOD Comparison ===")
(println " (pip install pyod scikit-learn to enable)")
(println "\n=== Stratum vs PyOD Comparison ===")
(println " (100 trees, 256 sample size, same seed)")
(ensure-odds-data)
(run-pyod-comparison "data/odds/shuttle.csv" "Shuttle")
(run-pyod-comparison "data/odds/http.csv" "Http")
(run-pyod-comparison "data/odds/forestcover.csv" "ForestCover")
Expand All @@ -219,13 +268,13 @@ print(f'{auc:.4f},{train_time:.1f},{score_time:.1f}')
rng (Random. 42)
;; First half: outliers at [10,10,10]
;; Second half: outliers shift to [-10,-10,-10]
features (into-array (Class/forName "[D")
(mapv (fn [_]
(let [arr (double-array n)]
(dotimes [i n]
(aset arr i (.nextGaussian ^Random rng)))
arr))
(range d)))
^"[[D" features (into-array (Class/forName "[D")
(mapv (fn [_]
(let [arr (double-array n)]
(dotimes [i n]
(aset arr i (.nextGaussian ^Random rng)))
arr))
(range d)))
labels (long-array n)
;; Inject outliers: 1% rate
_ (dotimes [i 500]
Expand Down
126 changes: 71 additions & 55 deletions bin/download-odds
Original file line number Diff line number Diff line change
@@ -1,95 +1,111 @@
#!/usr/bin/env bash
# Download ODDS (Outlier Detection DataSets) for isolation forest benchmarks.
# Source: https://odds.cs.stonybrook.edu/
# Source: https://odds.cs.stonybrook.edu/ (canonical Dropbox mirrors)
#
# Datasets are in .mat (MATLAB) format. We convert to CSV using Python.
# Last column is the anomaly label (0=normal, 1=anomaly).
#
# Requires: python3, scipy, h5py (pip install scipy h5py)

set -euo pipefail

DATADIR="data/odds"
mkdir -p "$DATADIR"

echo "Downloading ODDS datasets..."
# Check python3 + scipy + h5py availability up front
if ! python3 -c "import scipy.io, h5py" 2>/dev/null; then
echo "ERROR: python3 with scipy and h5py is required."
echo " pip install scipy h5py"
exit 1
fi

# Shuttle dataset (49097 rows, 9 features, 7.2% anomalies)
if [ ! -f "$DATADIR/shuttle.csv" ]; then
echo " Downloading shuttle.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/shuttle.mat" -o "$DATADIR/shuttle.mat"
# Conversion helper: handles both MATLAB v5 (scipy) and v7.3/HDF5 (h5py)
convert_mat() {
local matfile="$1"
local csvfile="$2"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/shuttle.mat')
X, y = d['X'], d['y'].flatten()
import numpy as np
try:
import scipy.io
d = scipy.io.loadmat('$matfile')
X, y = d['X'], d['y'].flatten()
except NotImplementedError:
import h5py
with h5py.File('$matfile', 'r') as f:
X = np.array(f['X']).T # h5py reads transposed
y = np.array(f['y']).flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/shuttle.csv', data, delimiter=',', fmt='%.6f')
print(f' shuttle: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
np.savetxt('$csvfile', data, delimiter=',', fmt='%.6f')
print(f' {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
}

echo "Downloading ODDS datasets..."

# Shuttle dataset (49097 rows, 9 features, 7.2% anomalies)
if [ ! -f "$DATADIR/shuttle.csv" ]; then
echo -n " shuttle: "
curl -sL "https://www.dropbox.com/s/mk8ozgisimfn3dw/shuttle.mat?dl=1" -o "$DATADIR/shuttle.mat"
convert_mat "$DATADIR/shuttle.mat" "$DATADIR/shuttle.csv"
rm -f "$DATADIR/shuttle.mat"
fi

# Http (KDD) dataset (567498 rows, 3 features, 0.4% anomalies)
if [ ! -f "$DATADIR/http.csv" ]; then
echo " Downloading http.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/http.mat" -o "$DATADIR/http.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/http.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/http.csv', data, delimiter=',', fmt='%.6f')
print(f' http: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
echo -n " http: "
curl -sL "https://www.dropbox.com/s/iy9ucsifal754tp/http.mat?dl=1" -o "$DATADIR/http.mat"
convert_mat "$DATADIR/http.mat" "$DATADIR/http.csv"
rm -f "$DATADIR/http.mat"
fi

# ForestCover dataset (286048 rows, 10 features, 0.96% anomalies)
if [ ! -f "$DATADIR/forestcover.csv" ]; then
echo " Downloading forestcover.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/forestcover.mat" -o "$DATADIR/forestcover.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/forestcover.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/forestcover.csv', data, delimiter=',', fmt='%.6f')
print(f' forestcover: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
rm -f "$DATADIR/forestcover.mat"
echo -n " forestcover: "
curl -sL "https://www.dropbox.com/s/awx8iuzbu8dkxf1/cover.mat?dl=1" -o "$DATADIR/cover.mat"
convert_mat "$DATADIR/cover.mat" "$DATADIR/forestcover.csv"
rm -f "$DATADIR/cover.mat"
fi

# Mammography dataset (11183 rows, 6 features, 2.32% anomalies)
if [ ! -f "$DATADIR/mammography.csv" ]; then
echo " Downloading mammography.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/mammography.mat" -o "$DATADIR/mammography.mat"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/mammography.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/mammography.csv', data, delimiter=',', fmt='%.6f')
print(f' mammography: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
"
echo -n " mammography: "
curl -sL "https://www.dropbox.com/s/tq2v4hhwyv17hlk/mammography.mat?dl=1" -o "$DATADIR/mammography.mat"
convert_mat "$DATADIR/mammography.mat" "$DATADIR/mammography.csv"
rm -f "$DATADIR/mammography.mat"
fi

# Pima Indians Diabetes dataset (768 rows, 8 features, 34.9% anomalies — hard)
if [ ! -f "$DATADIR/pima.csv" ]; then
echo " Downloading pima.mat..."
curl -sL "https://odds.cs.stonybrook.edu/wp-content/uploads/2016/04/pima.mat" -o "$DATADIR/pima.mat"
echo -n " pima: "
curl -sL "https://www.dropbox.com/s/mvlwu7p0nyk2a2r/pima.mat?dl=1" -o "$DATADIR/pima.mat"
convert_mat "$DATADIR/pima.mat" "$DATADIR/pima.csv"
rm -f "$DATADIR/pima.mat"
fi

# Credit Card Fraud dataset (284807 rows, 30 features, 0.17% anomalies)
# Source: OpenML mirror of Kaggle mlg-ulb/creditcardfraud
if [ ! -f "$DATADIR/creditcard.csv" ]; then
echo -n " creditcard: "
curl -sL "https://www.openml.org/data/v1/get_csv/1673544" -o "$DATADIR/creditcard_raw.csv"
python3 -c "
import scipy.io, numpy as np
d = scipy.io.loadmat('$DATADIR/pima.mat')
X, y = d['X'], d['y'].flatten()
data = np.column_stack([X, y])
np.savetxt('$DATADIR/pima.csv', data, delimiter=',', fmt='%.6f')
print(f' pima: {X.shape[0]} rows, {X.shape[1]} features, {int(y.sum())} anomalies ({100*y.mean():.1f}%)')
import numpy as np, csv
with open('$DATADIR/creditcard_raw.csv') as f:
reader = csv.reader(f)
header = next(reader)
rows = list(reader)
n = len(rows)
d = len(header) - 1 # last col is Class
data = np.empty((n, d + 1))
for i, row in enumerate(rows):
for j in range(d):
data[i, j] = float(row[j])
data[i, d] = float(row[d].strip(\"'\"))
np.savetxt('$DATADIR/creditcard.csv', data, delimiter=',', fmt='%.6f')
n_anom = int(data[:, -1].sum())
print(f' {n} rows, {d} features, {n_anom} anomalies ({100*n_anom/n:.2f}%)')
"
rm -f "$DATADIR/pima.mat"
rm -f "$DATADIR/creditcard_raw.csv"
fi

echo ""
echo "Done. ODDS datasets saved to $DATADIR/"
echo ""
echo "Optional: For Credit Card fraud dataset (Kaggle, 284K rows, 28 features):"
echo " 1. Download from https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud"
echo " 2. Place creditcard.csv in $DATADIR/"
Loading
Loading