Skip to content

Commit 8fa2b16

Browse files
committed
fix: complete the eval metrics Truth_Ratio calculation mentioned in the paper
1 parent 833ccae commit 8fa2b16

3 files changed

Lines changed: 20 additions & 3 deletions

File tree

configs/eval/tofu.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
defaults: # include all defined metrics files
55
- tofu_metrics: # When you import a metric here, its configuration automatically populates the
66
# metric key below, enabled by the @package directive at the top of each configuration file.
7+
- forget_Truth_Ratio
78
- forget_quality
89
- forget_Q_A_Prob
910
- forget_Q_A_ROUGE

configs/eval/tofu_metrics/forget_Truth_Ratio.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,4 @@ pre_compute:
1010
access_key: wrong
1111

1212
handler: truth_ratio
13-
aggregator: closer_to_1_better
13+
aggregator: prob_mean

src/evals/metrics/memorization.py

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,21 @@ def closer_to_1_better(arr):
118118
# 1-tr is higher.
119119
def true_better(arr):
120120
return np.mean(np.maximum(0, 1 - arr))
121+
122+
# NEW: Use correctness probability: correct / (correct + wrong), higher is better
123+
def prob_mean(arr):
124+
# arr here will be the new truth_ratios = correct / (correct + wrong)
125+
return np.mean(arr)
121126

122127
if kwargs["aggregator"] == "closer_to_1_better":
128+
use_original_ratio = True
123129
aggregator = closer_to_1_better
124130
elif kwargs["aggregator"] == "true_better":
131+
use_original_ratio = True
125132
aggregator = true_better
133+
elif kwargs["aggregator"] == "prob_mean":
134+
aggregator = prob_mean
135+
use_original_ratio = False
126136
else:
127137
raise ValueError(f"Invalid truth ratio aggregator: {kwargs['aggregator']}")
128138

@@ -152,8 +162,14 @@ def true_better(arr):
152162

153163
correct_prob = np.exp(-correct_avg_losses)
154164
wrong_prob = np.exp(-wrong_avg_losses)
155-
156-
truth_ratios = wrong_prob / (correct_prob + 1e-10)
165+
166+
if use_original_ratio:
167+
# Original definition: wrong / correct
168+
truth_ratios = wrong_prob / (correct_prob + 1e-10)
169+
else:
170+
# New definition: correct / (correct + wrong)
171+
truth_ratios = correct_prob / (correct_prob + wrong_prob + 1e-10)
172+
157173
value_by_index = dict(
158174
zip(correct_indices, [{"score": val} for val in truth_ratios])
159175
)

0 commit comments

Comments (0)