<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
  <channel>
    <title>AI Evals</title>
    <link>https://www.aievals.co</link>
    <description>Recent updates from aievals.co</description>
    <atom:link href="https://www.aievals.co/rss.xml" rel="self" type="application/rss+xml" />
    <language>en-us</language>
    <lastBuildDate>Fri, 29 May 2026 16:19:28 GMT</lastBuildDate>
    <item>
      <title>Why evals matter</title>
      <link>https://www.aievals.co/learn/foundations/why-evals-matter</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/foundations/why-evals-matter</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Foundations</category>
      <description>If you ship AI, evals are the moat. The case in one page.</description>
    </item>
    <item>
      <title>The eval maturity model</title>
      <link>https://www.aievals.co/learn/foundations/eval-maturity-model</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/foundations/eval-maturity-model</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Foundations</category>
      <description>Five stages from vibe-checking to a continuous quality program. Where are you?</description>
    </item>
    <item>
      <title>AI development as scientific method</title>
      <link>https://www.aievals.co/learn/foundations/scientific-method-for-ai</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/foundations/scientific-method-for-ai</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Foundations</category>
      <description>The observe, hypothesize, experiment, measure, iterate loop mapped onto an AI product team.</description>
    </item>
    <item>
      <title>Why LLM evals are hard</title>
      <link>https://www.aievals.co/learn/foundations/non-determinism-and-subjectivity</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/foundations/non-determinism-and-subjectivity</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Foundations</category>
      <description>Non-determinism, subjectivity, prompt sensitivity, and the reasons classical ML eval frameworks do not transfer.</description>
    </item>
    <item>
      <title>Eval vs monitoring vs guardrails</title>
      <link>https://www.aievals.co/learn/foundations/eval-vs-monitoring-vs-guardrails</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/foundations/eval-vs-monitoring-vs-guardrails</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Foundations</category>
      <description>Three concepts that share traces but do different jobs, with the architectural placement of each in a request lifecycle.</description>
    </item>
    <item>
      <title>The 60-80% rule</title>
      <link>https://www.aievals.co/learn/error-analysis/the-60-80-rule</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/the-60-80-rule</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>Three failure modes usually account for most of your bugs. Find them, fix them, repeat.</description>
    </item>
    <item>
      <title>Open coding for AI traces</title>
      <link>https://www.aievals.co/learn/error-analysis/open-coding-workflow</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/open-coding-workflow</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>Read traces, write notes, cluster notes into failure modes, prioritize fixes. The three-stage process and the LLM-assisted parts.</description>
    </item>
    <item>
      <title>Dimensional sampling</title>
      <link>https://www.aievals.co/learn/error-analysis/dimensional-sampling</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/dimensional-sampling</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>Synthetic input construction across features, scenarios, and personas to cover the failure surface, not the demo path.</description>
    </item>
    <item>
      <title>Failure-mode taxonomies</title>
      <link>https://www.aievals.co/learn/error-analysis/failure-mode-taxonomies</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/failure-mode-taxonomies</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>Bottom-up coding beats top-down templates, and why generic labels like hallucination mislead.</description>
    </item>
    <item>
      <title>Build your trace viewer in an afternoon</title>
      <link>https://www.aievals.co/learn/error-analysis/the-data-viewer</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/the-data-viewer</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>The single highest-return tool for an eval program. Streamlit and Next.js variants, with the minimum feature set.</description>
    </item>
    <item>
      <title>Case study: NurtureBoss</title>
      <link>https://www.aievals.co/learn/error-analysis/case-study-nurtureboss</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/error-analysis/case-study-nurtureboss</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Error Analysis</category>
      <description>One team&apos;s path from 66 percent date errors to 5 percent in a single error-analysis cycle.</description>
    </item>
    <item>
      <title>LLM-as-Judge: the practical introduction</title>
      <link>https://www.aievals.co/learn/llm-as-judge/introduction</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/introduction</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>What an LLM judge is, when to use one, when not to, and the three pitfalls that show up on every team&apos;s first attempt.</description>
    </item>
    <item>
      <title>The four-part judge prompt</title>
      <link>https://www.aievals.co/learn/llm-as-judge/judge-prompt-structure</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/judge-prompt-structure</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>A copyable template: role and definitions, criteria, few-shot from your train set, output format. Plus the runnable invocation.</description>
    </item>
    <item>
      <title>Calibrating your judge against humans</title>
      <link>https://www.aievals.co/learn/llm-as-judge/calibration-to-humans</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/calibration-to-humans</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>TPR and TNR over agreement percent. Train, dev, test splits. The 100-example minimum. Iterate the prompt until both rates clear 0.90.</description>
    </item>
    <item>
      <title>Judge biases you must defuse</title>
      <link>https://www.aievals.co/learn/llm-as-judge/biases-and-mitigations</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/biases-and-mitigations</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>Position, verbosity, self-preference, format, sycophancy. Each with the empirical evidence and the mitigation you bake into the harness.</description>
    </item>
    <item>
      <title>Where LLM judges fail</title>
      <link>https://www.aievals.co/learn/llm-as-judge/judgebench-honest-take</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/judgebench-honest-take</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>JudgeBench: GPT-4o is roughly random on hard correctness judgments. The implications for your pipeline are concrete, not theoretical.</description>
    </item>
    <item>
      <title>Pairwise vs direct scoring</title>
      <link>https://www.aievals.co/learn/llm-as-judge/pairwise-vs-direct</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/pairwise-vs-direct</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>When to ask the judge for a side-by-side ranking and when to ask for a single binary verdict.</description>
    </item>
    <item>
      <title>Distilled judges</title>
      <link>https://www.aievals.co/learn/llm-as-judge/distilled-judges</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/llm-as-judge/distilled-judges</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>LLM-as-Judge</category>
      <description>Why distillation matters in production, which shipped judges to know about, and the inline plus calibration architecture that holds up.</description>
    </item>
    <item>
      <title>Constructing your eval dataset</title>
      <link>https://www.aievals.co/learn/datasets/construction</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/construction</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>How to build the first hundred labeled examples, then grow them past five hundred without losing signal.</description>
    </item>
    <item>
      <title>Synthetic vs human-authored examples</title>
      <link>https://www.aievals.co/learn/datasets/synthetic-vs-human</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/synthetic-vs-human</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>Where each shape of data buys signal, where each one leaks, and how to mix them without poisoning the set.</description>
    </item>
    <item>
      <title>Versioning your eval set</title>
      <link>https://www.aievals.co/learn/datasets/versioning-lineage</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/versioning-lineage</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>A content-hashed eval set pinned to every result is a contract with your past self.</description>
    </item>
    <item>
      <title>Building regression sets</title>
      <link>https://www.aievals.co/learn/datasets/regression-sets</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/regression-sets</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>The set whose job is to fail loudly the next time you reintroduce a bug you already fixed.</description>
    </item>
    <item>
      <title>Dimensional coverage: features, scenarios, personas</title>
      <link>https://www.aievals.co/learn/datasets/dimensional-coverage</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/dimensional-coverage</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>A three-axis grid that tells you which cells of the failure surface your set actually probes.</description>
    </item>
    <item>
      <title>Where to keep your eval datasets</title>
      <link>https://www.aievals.co/learn/datasets/dataset-platforms</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/datasets/dataset-platforms</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Datasets &amp; Golden Sets</category>
      <description>Langfuse, Phoenix, Braintrust, LangSmith, or a JSONL in git. The choice that actually matters.</description>
    </item>
    <item>
      <title>Eval RAG in two layers</title>
      <link>https://www.aievals.co/learn/rag-evals/retrieval-vs-generation</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/retrieval-vs-generation</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>Separate retrieval and generation or you cannot fix what breaks.</description>
    </item>
    <item>
      <title>Retrieval metrics: Recall@K, MRR, NDCG, hit-rate</title>
      <link>https://www.aievals.co/learn/rag-evals/retrieval-metrics</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/retrieval-metrics</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>The IR metrics that distinguish a retriever that found the right chunk from one that did not.</description>
    </item>
    <item>
      <title>Faithfulness and atomic facts</title>
      <link>https://www.aievals.co/learn/rag-evals/generation-faithfulness</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/generation-faithfulness</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>Decompose the answer into atomic claims, judge each claim binary, and stop measuring hallucination as a single yes/no.</description>
    </item>
    <item>
      <title>Context precision and recall</title>
      <link>https://www.aievals.co/learn/rag-evals/context-precision-recall</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/context-precision-recall</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>The Ragas pair that tells you whether the retriever found the right chunks and whether they were the only chunks worth showing the model.</description>
    </item>
    <item>
      <title>Synthetic query generation</title>
      <link>https://www.aievals.co/learn/rag-evals/synthetic-query-generation</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/synthetic-query-generation</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>Bootstrap a labeled RAG eval set from your own documents, without inventing ground truth.</description>
    </item>
    <item>
      <title>Long-context RAG and RULER</title>
      <link>https://www.aievals.co/learn/rag-evals/long-context-rag</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/long-context-rag</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>Why advertised context lengths exceed effective context lengths, and the benchmarks that expose the gap.</description>
    </item>
    <item>
      <title>Non-English RAG eval</title>
      <link>https://www.aievals.co/learn/rag-evals/non-english-rag</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/rag-evals/non-english-rag</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>RAG Evals</category>
      <description>What to measure when your corpus and your users do not share a language with most of the literature.</description>
    </item>
    <item>
      <title>Trajectory-level evaluation</title>
      <link>https://www.aievals.co/learn/agentic-evals/trajectory-vs-outcome</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/trajectory-vs-outcome</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Outcome-only scoring ranks agents in the wrong order. Score the trajectory and the result.</description>
    </item>
    <item>
      <title>Tool-call evaluation: AST, schema, execution</title>
      <link>https://www.aievals.co/learn/agentic-evals/tool-call-evaluation</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/tool-call-evaluation</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Three levels of tool-call evaluation: schema conformance, semantic AST equivalence, and execution-grounded outcomes. Use all three or you measure the wrong thing.</description>
    </item>
    <item>
      <title>Sub-goal decomposition: plan graphs as eval objects</title>
      <link>https://www.aievals.co/learn/agentic-evals/sub-goal-decomposition</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/sub-goal-decomposition</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Real agent tasks are graphs. Evaluating only the leaf hides 4-of-5 sub-goal progress. Sub-goal completion is the planning metric that surfaces it.</description>
    </item>
    <item>
      <title>Pass^k: the metric that catches inconsistent agents</title>
      <link>https://www.aievals.co/learn/agentic-evals/pass-k-and-consistency</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/pass-k-and-consistency</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Pass@1 hides catastrophic inconsistency. Report pass^k or you do not know how often the agent will repeatedly succeed.</description>
    </item>
    <item>
      <title>Agent-as-a-judge: the trajectory-scoring pattern</title>
      <link>https://www.aievals.co/learn/agentic-evals/agent-as-judge</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/agent-as-judge</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Single-LLM judges lose the thread on long trajectories. An agent judge with its own tools can walk the steps and grade them independently.</description>
    </item>
    <item>
      <title>Reward hacking detection</title>
      <link>https://www.aievals.co/learn/agentic-evals/reward-hacking</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/reward-hacking</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Once an objective is optimized, it gets gamed. Reward hacking is empirically common; eval for it explicitly.</description>
    </item>
    <item>
      <title>Environmental drift: robustness to tool and API change</title>
      <link>https://www.aievals.co/learn/agentic-evals/environmental-drift</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/environmental-drift</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Most agent benchmarks freeze the environment. Production does not. Inject latency, schema shifts, and transient errors. Score the recovery, not the happy path.</description>
    </item>
    <item>
      <title>Regression suites for agent loops</title>
      <link>https://www.aievals.co/learn/agentic-evals/regression-suites</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/agentic-evals/regression-suites</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Agentic Evals</category>
      <description>Trajectory regression is harder than answer regression. Replay 50 prod trajectories on every change and diff outcome, step count, and tool-call sequence.</description>
    </item>
    <item>
      <title>Online vs offline: where each fires</title>
      <link>https://www.aievals.co/learn/production/online-vs-offline</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/online-vs-offline</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>Offline evals run on a frozen dataset for every change. Online evals sample real traffic continuously. They answer different questions and both belong in production.</description>
    </item>
    <item>
      <title>Trace schema: what to log</title>
      <link>https://www.aievals.co/learn/production/trace-schema</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/trace-schema</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>The trace is the unit of analysis for every eval program. This is the schema, the fields that matter, and the storage shape that makes downstream queries cheap.</description>
    </item>
    <item>
      <title>Sampling strategies for production traces</title>
      <link>https://www.aievals.co/learn/production/sampling-strategies</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/sampling-strategies</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>Uniform sampling is the default and the wrong default for tail slices. Stratify on the dimensions you care about, oversample failures, and reweight.</description>
    </item>
    <item>
      <title>Drift detection for production AI</title>
      <link>https://www.aievals.co/learn/production/drift-detection</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/drift-detection</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>Track input and output distributions over time. Page on changes that matter; ignore the ones that do not. The hardest part is deciding which is which.</description>
    </item>
    <item>
      <title>Cost and latency belong in the scorecard</title>
      <link>https://www.aievals.co/learn/production/cost-latency-eval</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/cost-latency-eval</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>A model that wins quality and triples p95 latency is not a win. Put quality, cost, and latency on one scorecard and pick on the Pareto frontier.</description>
    </item>
    <item>
      <title>Wiring evals into CI</title>
      <link>https://www.aievals.co/learn/production/ci-integration</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/ci-integration</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>Tier the suite into smoke, full, and nightly. Cap cost. Shard. Publish to the PR. Make the merge gate boring and the merge decision data-driven.</description>
    </item>
    <item>
      <title>Release gates from eval results</title>
      <link>https://www.aievals.co/learn/production/release-gates</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/release-gates</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>A release gate is a contract between the team and the user. Define the bands, automate the comparison, and hold the line when a regression candidate wants out.</description>
    </item>
    <item>
      <title>Incident response for AI products</title>
      <link>https://www.aievals.co/learn/production/incident-response</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/production/incident-response</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Production &amp; Observability</category>
      <description>An AI postmortem looks different from a service postmortem. The root cause is rarely a line of code. The actions are usually eval-set changes.</description>
    </item>
    <item>
      <title>Confidence intervals: Wilson and Clopper-Pearson</title>
      <link>https://www.aievals.co/learn/statistics/confidence-intervals</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/confidence-intervals</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>Wald is wrong. Wilson is the practical default. Clopper-Pearson is the conservative exact. With code and the case for why N=10 evals tell you almost nothing.</description>
    </item>
    <item>
      <title>Bootstrap and paired tests for noisy eval metrics</title>
      <link>https://www.aievals.co/learn/statistics/bootstrap-and-paired-tests</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/bootstrap-and-paired-tests</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>When the metric is not a clean Bernoulli, bootstrap. When the systems share inputs, pair. Both close the gap between point estimates and actionable decisions.</description>
    </item>
    <item>
      <title>Statistical power: how big does the eval set need to be</title>
      <link>https://www.aievals.co/learn/statistics/statistical-power</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/statistical-power</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>Power is the probability of detecting the delta you care about. Pick the delta first, derive the N, then commit. Reversing the order produces evals nobody can act on.</description>
    </item>
    <item>
      <title>Clustered standard errors: when CIs lie</title>
      <link>https://www.aievals.co/learn/statistics/clustered-standard-errors</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/clustered-standard-errors</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>When eval questions cluster (multiple questions per topic, multiple turns per session), naive standard errors are three times too small. The cluster-aware variance formula and code.</description>
    </item>
    <item>
      <title>Inter-rater agreement: Cohen&apos;s kappa and Krippendorff&apos;s alpha</title>
      <link>https://www.aievals.co/learn/statistics/inter-rater-agreement</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/inter-rater-agreement</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>Two qualified humans disagree on your rubric. That is information about the rubric, not about the raters. Measure agreement before you trust labels.</description>
    </item>
    <item>
      <title>Multiple comparisons: Bonferroni vs BH-FDR</title>
      <link>https://www.aievals.co/learn/statistics/multiple-comparisons</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/multiple-comparisons</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>Run twenty tests at alpha 0.05 and you expect one false positive. Correct for it. Bonferroni for strict family-wise control; Benjamini-Hochberg for false-discovery control.</description>
    </item>
    <item>
      <title>Effect size: Cohen&apos;s d and absolute deltas</title>
      <link>https://www.aievals.co/learn/statistics/effect-size</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/statistics/effect-size</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Statistics &amp; Rigor</category>
      <description>A statistically significant 0.3-point win on a 0-100 rubric is not worth shipping. Report the magnitude, not just the p-value.</description>
    </item>
    <item>
      <title>Walking the OWASP Top 10 for LLM apps</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/owasp-llm-top-10</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/owasp-llm-top-10</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>The ten failure categories every LLM application owner should be probing, with the smallest useful test for each.</description>
    </item>
    <item>
      <title>Designing a red-team program</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/red-team-program-design</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/red-team-program-design</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>The operating model for an LLM red team: frequency, staffing, intake, scoring, and where it sits relative to automated regression.</description>
    </item>
    <item>
      <title>HarmBench in practice</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/harmbench-walkthrough</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/harmbench-walkthrough</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>A code walk through the HarmBench corpus, the StrongREJECT scoring classifier, and how to read the leaderboard without overfitting to it.</description>
    </item>
    <item>
      <title>AILuminate in practice</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/ailuminate-walkthrough</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/ailuminate-walkthrough</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>MLCommons&apos; standardized safety benchmark, the twelve hazard categories, and how to read its grade scale alongside HarmBench.</description>
    </item>
    <item>
      <title>Over-refusal and the balance question</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/over-refusal-and-balance</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/over-refusal-and-balance</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>Why refusal rate alone is a misleading metric, the XSTest pattern for paired benign cases, and how to grade refusal against necessity.</description>
    </item>
    <item>
      <title>Jailbreaks and defenses</title>
      <link>https://www.aievals.co/learn/safety-and-red-team/jailbreaks-and-defenses</link>
      <guid isPermaLink="true">https://www.aievals.co/learn/safety-and-red-team/jailbreaks-and-defenses</guid>
      <pubDate>Fri, 29 May 2026 00:00:00 GMT</pubDate>
      <category>Safety &amp; Red-Team</category>
      <description>GCG-style adversarial suffixes, the empirical floor on defense success, and what your guardrail stack should actually do.</description>
    </item>
  </channel>
</rss>
