Example: Evaluating from Saved Results

If you've already run inference and saved the predictions (as a pickle file or in COCO JSON results format), you can evaluate calibration without re-running the model.

From a Pickle File (DETR-style)

This is the format used in our paper's experiments:
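
Concretely, each entry of the pickle is one dict per image. The field names and shapes below are not part of the library's API; they are inferred from the conversion loop that follows (DETR-style raw outputs plus that image's ground truth):

import numpy as np

# Illustrative layout of one element of infer_results.pkl (one dict per image).
# Shapes are examples only; boxes are assumed to be normalized cxcywh.
sample = {
    "orig_size":   (480, 640),              # original (h, w) in pixels
    "pred_logits": np.zeros((100, 80)),     # (num_queries, num_classes) raw logits
    "pred_boxes":  np.zeros((100, 4)),      # (num_queries, 4) predicted boxes
    "boxes":       np.zeros((2, 4)),        # (num_gt, 4) ground-truth boxes
    "labels":      np.zeros(2, dtype=int),  # (num_gt,) ground-truth class ids
}

With that layout, the conversion and evaluation are: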

import pickle
import numpy as np
from scipy.special import expit  # sigmoid

import uq_detr
from uq_detr import Detections, GroundTruth, select

# Load saved inference results
with open("infer_results.pkl", "rb") as f:
    raw_data = pickle.load(f)

# Convert to uq_detr format
all_queries = []
ground_truths = []

for sample in raw_data:
    h, w = sample["orig_size"]
    scores = expit(sample["pred_logits"])   # sigmoid activation

    all_queries.append(Detections.from_cxcywh(
        sample["pred_boxes"], scores, image_size=(h, w)
    ))

    if len(sample["boxes"]) > 0:
        gt = GroundTruth.from_cxcywh(
            sample["boxes"], sample["labels"], image_size=(h, w)
        )
    else:
        gt = GroundTruth(
            boxes=np.zeros((0, 4)), labels=np.zeros(0, dtype=int)
        )
    ground_truths.append(gt)

# Evaluate
filtered = [select(q, method="threshold", param=0.3) for q in all_queries]
print("OCE:   ", uq_detr.oce(filtered, ground_truths).score)
print("D-ECE: ", uq_detr.dece(filtered, ground_truths, tp_criterion="independent").score)
print("LA-ECE:", uq_detr.laece(filtered, ground_truths, tp_criterion="independent").score)

From COCO JSON Format

If you have predictions in the standard COCO results format:
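
A COCO results file is a flat JSON list with one entry per detection; bbox is [x_min, y_min, width, height] in absolute pixels:

# predictions.json: a flat list with one dict per detection, e.g.
# [
#   {"image_id": 42, "category_id": 3, "bbox": [31.0, 52.5, 140.0, 88.0], "score": 0.87},
#   ...
# ]

The conversion below groups these by image and builds one Detections/GroundTruth pair per image: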

import json
from collections import defaultdict

import numpy as np
from pycocotools.coco import COCO

import uq_detr
from uq_detr import Detections, GroundTruth

# Load COCO annotations and predictions
coco_gt = COCO("instances_val2017.json")
with open("predictions.json") as f:
    coco_preds = json.load(f)

# Group predictions by image
preds_by_image = defaultdict(list)
for p in coco_preds:
    preds_by_image[p["image_id"]].append(p)

# Convert
all_detections = []
all_ground_truths = []

for img_id in coco_gt.getImgIds():
    # Ground truth
    ann_ids = coco_gt.getAnnIds(imgIds=img_id)
    anns = coco_gt.loadAnns(ann_ids)
    gt_boxes = np.array([a["bbox"] for a in anns])  # xywh
    gt_labels = np.array([a["category_id"] for a in anns])

    if len(gt_boxes) > 0:
        gt = GroundTruth.from_xywh(gt_boxes, gt_labels)
    else:
        gt = GroundTruth(boxes=np.zeros((0, 4)), labels=np.zeros(0, dtype=int))
    all_ground_truths.append(gt)

    # Predictions (COCO result format: xywh boxes, scalar scores)
    preds = preds_by_image.get(img_id, [])
    if preds:
        pred_boxes = np.array([p["bbox"] for p in preds])
        pred_scores = np.array([p["score"] for p in preds])
        pred_labels = np.array([p["category_id"] for p in preds])
        det = Detections.from_xywh(pred_boxes, pred_scores, labels=pred_labels)
    else:
        det = Detections(
            boxes=np.zeros((0, 4)), scores=np.zeros(0), labels=np.zeros(0, dtype=int)
        )
    all_detections.append(det)

# Evaluate
print("OCE:   ", uq_detr.oce(all_detections, all_ground_truths).score)
print("D-ECE: ", uq_detr.dece(all_detections, all_ground_truths, tp_criterion="greedy").score)

Note

COCO JSON results contain only a single max-confidence score per detection, not the full class distribution, so OCE will use the binary approximation.