{
  "status": "draft_methodology_evidence_not_final_ranking",
  "run_id": "20260506T213530Z",
  "providers": ["brave", "serpapi", "tavily"],
  "task_count": 10,
  "qrels_warning": "Metrics use draft qrels from URL patterns/domain hints plus small manual-review annotations; public results require reviewed qrels.",
  "summary": [
    {
      "provider": "brave",
      "queries": 10,
      "mean_ndcg_at_10": 0.61,
      "mean_success_at_3": 0.7,
      "mean_success_at_5": 0.7,
      "mean_mrr": 0.6433,
      "mean_precision_at_5": 0.32,
      "median_latency_ms": 1055
    },
    {
      "provider": "serpapi",
      "queries": 10,
      "mean_ndcg_at_10": 0.6553,
      "mean_success_at_3": 0.7,
      "mean_success_at_5": 0.7,
      "mean_mrr": 0.7,
      "mean_precision_at_5": 0.38,
      "median_latency_ms": 41
    },
    {
      "provider": "tavily",
      "queries": 10,
      "mean_ndcg_at_10": 0.5921,
      "mean_success_at_3": 0.6,
      "mean_success_at_5": 0.7,
      "mean_mrr": 0.5468,
      "mean_precision_at_5": 0.38,
      "median_latency_ms": 623
    }
  ],
  "methodology_implications": [
    "Use objective observations and standard IR metrics before weighted interpretations.",
    "Handle empty results through result_count plus Success@k, MRR, Precision@k, and NDCG@k rather than a separate zero-result score.",
    "Use human or constrained LLM review for ambiguity, current-event, and legal/source-diversity tasks.",
    "Blind pooled URLs and build reviewed graded qrels before publishing rankings."
  ]
}
