[{"data":1,"prerenderedAt":75},["ShallowReactive",2],{"term-b\u002Fbenchmark":3,"related-b\u002Fbenchmark":60},{"id":4,"title":5,"acronym":6,"body":7,"category":40,"description":41,"difficulty":42,"extension":43,"letter":44,"meta":45,"navigation":46,"path":47,"related":48,"seo":54,"sitemap":55,"stem":58,"subcategory":6,"__hash__":59},"terms\u002Fterms\u002Fb\u002Fbenchmark.md","Benchmark",null,{"type":8,"value":9,"toc":33},"minimark",[10,15,19,23,26,30],[11,12,14],"h2",{"id":13},"eli5-the-vibe-check","ELI5 — The Vibe Check",[16,17,18],"p",{},"In AI, a benchmark is a standardized test that measures how good a model is — like the SAT for AI. MMLU tests general knowledge, HumanEval tests coding, HellaSwag tests common sense. Every new model release comes with a table showing benchmark scores, and everyone immediately argues about whether the benchmarks even measure the right things.",[11,20,22],{"id":21},"real-talk","Real Talk",[16,24,25],{},"AI benchmarks are standardized evaluation suites measuring model capabilities across specific dimensions. Key benchmarks include MMLU (knowledge), HumanEval\u002FSWE-bench (coding), GSM8K (math), GPQA (expert reasoning), and ARC (common sense). \nWhile essential for comparing models, benchmarks face criticism for data contamination, narrow measurement, and potential gaming.",[11,27,29],{"id":28},"when-youll-hear-this","When You'll Hear This",[16,31,32],{},"\"The new model topped MMLU but real-world performance is what matters.\" \u002F \"Benchmarks are necessary but not sufficient — you need to eval on your specific use case.\"",{"title":34,"searchDepth":35,"depth":35,"links":36},"",2,[37,38,39],{"id":13,"depth":35,"text":14},{"id":21,"depth":35,"text":22},{"id":28,"depth":35,"text":29},"ai","In AI, a benchmark is a standardized test that measures how good a model is — like the SAT for AI.","beginner","md","b",{},true,"\u002Fterms\u002Fb\u002Fbenchmark",[49,50,51,52,53],"Evaluation","LLM","MMLU","Training","Model",{"title":5,"description":41},{"changefreq":56,"priority":57},"weekly",0.7,"terms\u002Fb\u002Fbenchmark","6ozqCFx5TsPS8wA0nlVEY0ra2yCn3rs-7dt2NxxQkY8",[61,65,69,72],{"title":49,"path":62,"acronym":6,"category":40,"difficulty":63,"description":64},"\u002Fterms\u002Fe\u002Fevaluation","intermediate","Evaluation in AI is figuring out if your model actually works — not just on test data, but in the real world.",{"title":50,"path":66,"acronym":67,"category":40,"difficulty":42,"description":68},"\u002Fterms\u002Fl\u002Fllm","Large Language Model","An LLM is a humongous AI that read basically the entire internet and learned to predict what words come next, really really well.",{"title":53,"path":70,"acronym":6,"category":40,"difficulty":42,"description":71},"\u002Fterms\u002Fm\u002Fmodel","A model is the trained AI — the finished product.",{"title":52,"path":73,"acronym":6,"category":40,"difficulty":63,"description":74},"\u002Fterms\u002Ft\u002Ftraining","Training is the long, expensive process where an AI learns from data.",1776518259925]