[{"data":1,"prerenderedAt":450},["ShallowReactive",2],{"blog-post-/blogs/programbench-code-ceiling":3},{"_path":4,"_dir":5,"_draft":6,"_partial":6,"_locale":7,"title":8,"description":9,"date":10,"image":11,"alt":12,"ogImage":11,"tags":13,"published":18,"body":19,"_type":443,"_id":444,"_source":445,"_file":446,"_stem":447,"_extension":448,"sitemap":449},"/blogs/programbench-code-ceiling","blogs",false,"","AI代码能力的天花板，比你想象的低得多","ProgramBench 基准测试显示所有主流大模型在系统级代码生成任务上得分全部为 0%，这揭示了当前 AI 编程能力的真实边界。","2026-05-07","/blogs-img/programbench_cover_0.jpg","AI代码能力天花板插图",[14,15,16,17],"AI编程","代码生成","ProgramBench","大模型",true,{"type":20,"children":21,"toc":435},"root",[22,30,36,41,50,55,59,66,71,76,81,93,98,103,108,122,125,131,136,141,153,158,169,174,179,184,194,202,205,211,216,228,233,244,249,261,266,271,276,284,287,293,298,303,314,319,324,329,334,345,350,370,375,383,386,392,397,408,413,418,423,426],{"type":23,"tag":24,"props":25,"children":27},"element","h1",{"id":26},"ai代码能力的天花板比你想象的低得多",[28],{"type":29,"value":8},"text",{"type":23,"tag":31,"props":32,"children":33},"p",{},[34],{"type":29,"value":35},"你可能听过各种AI编程有多厉害的叙事——Claude Code刷榜SWE-bench、GPT-5.5 Coding拿下93%准确率、Gemini 2.5 Ultra击败专业程序员……",{"type":23,"tag":31,"props":37,"children":38},{},[39],{"type":29,"value":40},"但今天看到一组数据，让我愣了好几秒。",{"type":23,"tag":31,"props":42,"children":43},{},[44],{"type":23,"tag":45,"props":46,"children":47},"strong",{},[48],{"type":29,"value":49},"所有主流大模型，在ProgramBench上的得分，全部是 0%。",{"type":23,"tag":31,"props":51,"children":52},{},[53],{"type":29,"value":54},"一个字面意义上的 \"零\"。",{"type":23,"tag":56,"props":57,"children":58},"hr",{},[],{"type":23,"tag":60,"props":61,"children":63},"h2",{"id":62},"_1-0是什么概念",[64],{"type":29,"value":65},"1. 0%是什么概念？",{"type":23,"tag":31,"props":67,"children":68},{},[69],{"type":29,"value":70},"先科普一下。ProgramBench 是一个最近刚发布的代码生成基准测试，但它的测试方式跟以往那些妖艳贱货完全不一样。",{"type":23,"tag":31,"props":72,"children":73},{},[74],{"type":29,"value":75},"传统的基准测试，像 HumanEval、SWE-bench 这些，考的是\"局部手术\"能力——给你一个有bug的函数，让AI修一下；给你一个算法题，让AI写个解法。这类任务，AI确实已经很能打了。",{"type":23,"tag":31,"props":77,"children":78},{},[79],{"type":29,"value":80},"但 ProgramBench 上来就是一道大题：",{"type":23,"tag":82,"props":83,"children":84},"blockquote",{},[85],{"type":23,"tag":31,"props":86,"children":87},{},[88],{"type":23,"tag":45,"props":89,"children":90},{},[91],{"type":29,"value":92},"无网络条件下，完整复现 ffmpeg、SQLite、ripgrep 这些复杂软件系统。",{"type":23,"tag":31,"props":94,"children":95},{},[96],{"type":29,"value":97},"不是修一个函数，是从零开始，把一整个有几十万行代码、几十个模块、复杂依赖关系的系统给我重建出来。",{"type":23,"tag":31,"props":99,"children":100},{},[101],{"type":29,"value":102},"考的不是\"会不会写代码\"，考的是\"能不能独立建模一个完整的软件系统\"。",{"type":23,"tag":31,"props":104,"children":105},{},[106],{"type":29,"value":107},"结果，所有模型，齐刷刷，0%。",{"type":23,"tag":31,"props":109,"children":110},{},[111,117],{"type":23,"tag":112,"props":113,"children":116},"img",{"alt":114,"src":115},"传统基准 vs ProgramBench 上","/blogs-img/programbench_1_0.jpg",[],{"type":23,"tag":112,"props":118,"children":121},{"alt":119,"src":120},"传统基准 vs ProgramBench 下","/blogs-img/programbench_1_1.jpg",[],{"type":23,"tag":56,"props":123,"children":124},{},[],{"type":23,"tag":60,"props":126,"children":128},{"id":127},"_2-为什么传统基准一直在注水",[129],{"type":29,"value":130},"2. 为什么传统基准一直在\"注水\"？",{"type":23,"tag":31,"props":132,"children":133},{},[134],{"type":29,"value":135},"你可能会问：那之前的那些高分是怎么来的？",{"type":23,"tag":31,"props":137,"children":138},{},[139],{"type":29,"value":140},"说实话，看到 ProgramBench 这个结果，我第一反应是去翻之前那些基准测试到底在测什么。",{"type":23,"tag":31,"props":142,"children":143},{},[144,146,151],{"type":29,"value":145},"然后我意识到一个问题——",{"type":23,"tag":45,"props":147,"children":148},{},[149],{"type":29,"value":150},"我们一直在用\"局部战斗\"的数据，论证\"全局作战\"的能力",{"type":29,"value":152},"。",{"type":23,"tag":31,"props":154,"children":155},{},[156],{"type":29,"value":157},"就像你不能因为一个人能精准地扣好一颗纽扣，就判断他能独立完成一套西装的定制。",{"type":23,"tag":31,"props":159,"children":160},{},[161,163,168],{"type":29,"value":162},"HumanEval 考的是单函数实现，SWE-bench 考的是补丁级别的修复，这些任务有一个共同特点：",{"type":23,"tag":45,"props":164,"children":165},{},[166],{"type":29,"value":167},"有上下文、有边界、有参照",{"type":29,"value":152},{"type":23,"tag":31,"props":170,"children":171},{},[172],{"type":29,"value":173},"你不是在造一座房子，你是在给一座已经建好的房子换个灯泡。",{"type":23,"tag":31,"props":175,"children":176},{},[177],{"type":29,"value":178},"这当然有价值。但这不是\"系统级软件工程能力\"。",{"type":23,"tag":31,"props":180,"children":181},{},[182],{"type":29,"value":183},"ProgramBench 第一次把测试拉到了这个维度——架构设计、依赖管理、编译构建、多模块协同。这才是真正考验AI\"从零构建\"能力的地方。",{"type":23,"tag":31,"props":185,"children":186},{},[187,189],{"type":29,"value":188},"然后大家发现：",{"type":23,"tag":45,"props":190,"children":191},{},[192],{"type":29,"value":193},"离及格还差得远。",{"type":23,"tag":31,"props":195,"children":196},{},[197],{"type":23,"tag":112,"props":198,"children":201},{"alt":199,"src":200},"换灯泡 vs 建房子","/blogs-img/programbench_2.jpg",[],{"type":23,"tag":56,"props":203,"children":204},{},[],{"type":23,"tag":60,"props":206,"children":208},{"id":207},"_3-0背后的真实差距",[209],{"type":29,"value":210},"3. 0%背后的真实差距",{"type":23,"tag":31,"props":212,"children":213},{},[214],{"type":29,"value":215},"说个我的理解，可能不太准确，但我觉得能说明问题。",{"type":23,"tag":31,"props":217,"children":218},{},[219,221,226],{"type":29,"value":220},"AI编程现在的能力，更像是一个",{"type":23,"tag":45,"props":222,"children":223},{},[224],{"type":29,"value":225},"超级搜索引擎 + 代码补全器",{"type":29,"value":227},"。你给它一个清晰的上下文，它能帮你把缺失的部分补上，而且补得相当准确。",{"type":23,"tag":31,"props":229,"children":230},{},[231],{"type":29,"value":232},"但软件工程真正难的部分是什么？",{"type":23,"tag":31,"props":234,"children":235},{},[236,238,243],{"type":29,"value":237},"是",{"type":23,"tag":45,"props":239,"children":240},{},[241],{"type":29,"value":242},"定义问题本身",{"type":29,"value":152},{"type":23,"tag":31,"props":245,"children":246},{},[247],{"type":29,"value":248},"一个生产级的系统，不是从\"写代码\"开始的，是从\"理解要解决什么问题\"开始的。要跟业务方反复对齐，要理解上下游的约束，要权衡技术债和迭代速度，要考虑可维护性……",{"type":23,"tag":31,"props":250,"children":251},{},[252,254,259],{"type":29,"value":253},"这些东西，目前的AI基本上是无能为力的。不是技术问题，是",{"type":23,"tag":45,"props":255,"children":256},{},[257],{"type":29,"value":258},"信息不对称问题",{"type":29,"value":260},"——AI没有上帝视角，它只能看到你喂给它的上下文。",{"type":23,"tag":31,"props":262,"children":263},{},[264],{"type":29,"value":265},"所以回到 ProgramBench 的 0%——",{"type":23,"tag":31,"props":267,"children":268},{},[269],{"type":29,"value":270},"它测的不是AI会不会写代码，是AI能不能在没有\"参考答案\"的情况下，独立完成一个复杂系统的架构和实现。",{"type":23,"tag":31,"props":272,"children":273},{},[274],{"type":29,"value":275},"这件事，目前没有人做到。",{"type":23,"tag":31,"props":277,"children":278},{},[279],{"type":23,"tag":112,"props":280,"children":283},{"alt":281,"src":282},"AI的边界：超级搜索 vs 定义问题","/blogs-img/programbench_3.jpg",[],{"type":23,"tag":56,"props":285,"children":286},{},[],{"type":23,"tag":60,"props":288,"children":290},{"id":289},"_4-对企业级agent开发意味着什么",[291],{"type":29,"value":292},"4. 对企业级Agent开发意味着什么？",{"type":23,"tag":31,"props":294,"children":295},{},[296],{"type":29,"value":297},"说这么多，你可能要问：这跟我做企业级Agent开发有什么关系？",{"type":23,"tag":31,"props":299,"children":300},{},[301],{"type":29,"value":302},"关系大了。",{"type":23,"tag":31,"props":304,"children":305},{},[306,308,313],{"type":29,"value":307},"我做了不少企业级项目，深刻体会到一件事——",{"type":23,"tag":45,"props":309,"children":310},{},[311],{"type":29,"value":312},"客户买的从来不是\"能跑\"，是\"能交代\"",{"type":29,"value":152},{"type":23,"tag":31,"props":315,"children":316},{},[317],{"type":29,"value":318},"什么意思？",{"type":23,"tag":31,"props":320,"children":321},{},[322],{"type":29,"value":323},"你交付一个Agent给企业，人家要的不是你证明\"AI能干活\"，人家要的是能写进汇报材料、能跟领导汇报、能通过审计的东西。",{"type":23,"tag":31,"props":325,"children":326},{},[327],{"type":29,"value":328},"这背后需要的，是系统设计文档、架构图、接口规范、异常处理流程、监控告警配置……这些\"工程化\"的部分。",{"type":23,"tag":31,"props":330,"children":331},{},[332],{"type":29,"value":333},"而这些东西，目前AI能帮你写的，都是模板化的。真要结合业务场景做定制，还是得人来。",{"type":23,"tag":31,"props":335,"children":336},{},[337,339,344],{"type":29,"value":338},"所以我现在的做法是：",{"type":23,"tag":45,"props":340,"children":341},{},[342],{"type":29,"value":343},"让AI做执行层的东西，把决策层留给人",{"type":29,"value":152},{"type":23,"tag":31,"props":346,"children":347},{},[348],{"type":29,"value":349},"具体来说：",{"type":23,"tag":351,"props":352,"children":353},"ul",{},[354,360,365],{"type":23,"tag":355,"props":356,"children":357},"li",{},[358],{"type":29,"value":359},"标准化流程：AI全链路搞定",{"type":23,"tag":355,"props":361,"children":362},{},[363],{"type":29,"value":364},"非标准化场景：人定义框架，AI填充细节",{"type":23,"tag":355,"props":366,"children":367},{},[368],{"type":29,"value":369},"系统集成：人来做架构设计，AI写具体实现",{"type":23,"tag":31,"props":371,"children":372},{},[373],{"type":29,"value":374},"这样既享受了AI的效率，又保住了交付质量。",{"type":23,"tag":31,"props":376,"children":377},{},[378],{"type":23,"tag":112,"props":379,"children":382},{"alt":380,"src":381},"企业级Agent交付：人机协作分层","/blogs-img/programbench_4.jpg",[],{"type":23,"tag":56,"props":384,"children":385},{},[],{"type":23,"tag":60,"props":387,"children":389},{"id":388},"_5-写在最后",[390],{"type":29,"value":391},"5. 写在最后",{"type":23,"tag":31,"props":393,"children":394},{},[395],{"type":29,"value":396},"ProgramBench 的 0%，不是一个黑点，反而是一个清醒剂。",{"type":23,"tag":31,"props":398,"children":399},{},[400,402,407],{"type":29,"value":401},"它告诉我们：",{"type":23,"tag":45,"props":403,"children":404},{},[405],{"type":29,"value":406},"AI编程被吹过头的那部分，是局部任务能力，不是系统级工程能力",{"type":29,"value":152},{"type":23,"tag":31,"props":409,"children":410},{},[411],{"type":29,"value":412},"对于想靠AI编程创业、做产品的人来说，这反而是个好消息——因为门槛没有想象中那么低，你积累的系统设计能力、领域知识、业务理解，依然是有价值的护城河。",{"type":23,"tag":31,"props":414,"children":415},{},[416],{"type":29,"value":417},"AI会替代的是\"重复劳动\"，不是\"复杂决策\"。",{"type":23,"tag":31,"props":419,"children":420},{},[421],{"type":29,"value":422},"搞清楚这个边界，比盲目追热点重要得多。",{"type":23,"tag":56,"props":424,"children":425},{},[],{"type":23,"tag":31,"props":427,"children":428},{},[429],{"type":23,"tag":430,"props":431,"children":432},"em",{},[433],{"type":29,"value":434},"相关参考：ProgramBench 基准测试发布于 2026年5月7日",{"title":7,"searchDepth":436,"depth":436,"links":437},2,[438,439,440,441,442],{"id":62,"depth":436,"text":65},{"id":127,"depth":436,"text":130},{"id":207,"depth":436,"text":210},{"id":289,"depth":436,"text":292},{"id":388,"depth":436,"text":391},"markdown","content:blogs:programbench-code-ceiling.md","content","blogs/programbench-code-ceiling.md","blogs/programbench-code-ceiling","md",{"loc":4},1778165126992]