[{"data":1,"prerenderedAt":738},["ShallowReactive",2],{"/2026/harness-engineering":3,"surround-/2026/harness-engineering":726},{"id":4,"title":5,"body":6,"categories":676,"comment":678,"date":679,"description":680,"donation":678,"draft":681,"extension":682,"image":683,"meta":684,"navigation":678,"path":686,"permalink":683,"postfooter":678,"published":683,"readingTime":687,"recommend":683,"references":692,"seo":714,"sitemap":715,"stem":716,"tags":717,"type":724,"updated":679,"__hash__":725},"content/posts/2026/harness-engineering.md","Harness Engineering",{"type":7,"value":8,"toc":650},"minimark",[9,18,32,35,43,46,51,54,109,116,120,123,131,134,144,147,154,157,167,170,176,179,183,186,191,194,208,211,215,218,235,238,242,245,266,273,290,293,297,304,307,314,317,337,341,344,347,350,364,371,374,378,381,384,401,408,412,415,422,426,429,472,475,481,484,488,491,495,498,512,515,519,522,542,545,549,552,566,570,576,590,594,597,600,611,614,617,628,631,634,637,642,645],[10,11,14],"alert",{"title":12,"type":13},"说明","info",[15,16,17],"p",{},"本文初稿由 AI 生成，并经人工整理、校对与修订。",[10,19,21],{"title":20,"type":13},"本文怎么讲",[15,22,23,27,28,31],{},[24,25,26],"strong",{},"Harness Engineering 不是“把 Prompt 写得更长”","，而是围绕 Agent 的运行环境、工具编排、状态管理、验证反馈和安全边界做系统设计。\n我会沿着视频里那条很顺的主线来讲：",[24,29,30],{},"演进、构成、实践","。这个框架最大的好处是，不会把 Harness 讲成一个玄学新词，而是把它放回 Agent 工程的真实链路里。",[15,33,34],{},"我这段时间反复看了这支视频，也对照了 OpenAI、Anthropic 在 2025-2026 年公开的实践，最大的感受是：",[36,37,38],"blockquote",{},[15,39,40],{},[24,41,42],{},"AI 工程的重心，正在从“怎么写一句好 prompt”，移动到“怎么把 agent 放进一个能稳定工作的运行系统里”。",[15,44,45],{},"我踩过的坑很典型：多步任务里提前宣布完成、工具失败后乱跳步骤、上下文一长就丢目标。问题不在“模型不够强”，而在“运行框架太薄”。",[47,48,50],"h2",{"id":49},"从-prompt-到-context再到-harness","从 Prompt 到 Context，再到 Harness",[15,52,53],{},"先把三个阶段拆开看，会更清楚：",[55,56,57,73],"table",{},[58,59,60],"thead",{},[61,62,63,67,70],"tr",{},[64,65,66],"th",{},"阶段",[64,68,69],{},"核心关注点",[64,71,72],{},"典型问题",[74,75,76,88,99],"tbody",{},[61,77,78,82,85],{},[79,80,81],"td",{},"Prompt Engineering",[79,83,84],{},"指令怎么写",[79,86,87],{},"怎么让模型按要求回答",[61,89,90,93,96],{},[79,91,92],{},"Context Engineering",[79,94,95],{},"上下文怎么组织",[79,97,98],{},"什么信息该进窗口，怎么持续维护",[61,100,101,103,106],{},[79,102,5],{},[79,104,105],{},"Agent 怎么运行",[79,107,108],{},"怎么让模型在多步任务里稳定做事",[15,110,111,112,115],{},"Anthropic 讲 ",[113,114,92],"code",{"code":92}," 时，已经把问题从“写一句话”扩展到“维护整组上下文状态”。到 2026 年，OpenAI 和 Anthropic 又进一步强调：拉开 Agent 差距的，通常是整套 harness，而不只是 prompt 或 context。",[47,117,119],{"id":118},"harness-到底是什么","Harness 到底是什么",[15,121,122],{},"我更倾向于用 Anthropic 那句定义：",[36,124,125],{},[15,126,127,130],{},[113,128,129],{"code":129},"agent harness"," 是让模型作为 agent 运作的系统。它负责处理输入、编排工具调用，并返回结果。",[15,132,133],{},"这句话先排除了两个常见误解：",[135,136,137,141],"ul",{},[138,139,140],"li",{},"Harness 不是模型本身。",[138,142,143],{},"Harness 也不只是 prompt 模板或工具清单。",[15,145,146],{},"我更愿意把它翻译成一句工程话：",[36,148,149],{},[15,150,151],{},[24,152,153],{},"Harness 是包在模型外面的那层执行系统，它把“会推理的模型”变成“能持续完成任务的 agent”。",[15,155,156],{},"举个任务：",[158,159,165],"pre",{"className":160,"code":162,"language":163,"meta":164},[161],"language-text","帮我定位登录页报错，修复它，跑测试，最后给出修改说明。\n","text","",[113,166,162],{"__ignoreMap":164},[15,168,169],{},"模型负责推理，harness 负责把推理变成连续动作：",[158,171,174],{"className":172,"code":173,"language":163,"meta":164},[161],"接收任务\n  -> 提供可用工具\n  -> 决定是否调用工具\n  -> 执行工具\n  -> 回填结果\n  -> 保留中间状态\n  -> 触发自检/测试/评审\n  -> 在满足退出条件后结束\n",[113,175,173],{"__ignoreMap":164},[15,177,178],{},"没有这一层，模型更像“会说话的接口”，而不是“能持续干活的人”。",[47,180,182],{"id":181},"一个-harness-通常包含什么","一个 Harness 通常包含什么",[15,184,185],{},"视频里“构成”这段很关键。Harness Engineering 本质上就是把容易被忽略的工程件补齐。",[187,188,190],"h3",{"id":189},"_1-任务入口","1. 任务入口",[15,192,193],{},"入口不是“把需求贴给模型”，至少要写清楚：",[135,195,196,199,202,205],{},[138,197,198],{},"system prompt 或角色设定",[138,200,201],{},"初始约束",[138,203,204],{},"成功标准",[138,206,207],{},"可用资源和权限",[15,209,210],{},"我自己的经验是，入口模糊时，后面的 loop 越复杂，跑偏越稳定。",[187,212,214],{"id":213},"_2-工具层","2. 工具层",[15,216,217],{},"工具层决定 agent 能不能动手：",[135,219,220,223,226,229,232],{},[138,221,222],{},"读写文件",[138,224,225],{},"执行命令",[138,227,228],{},"浏览网页",[138,230,231],{},"调数据库",[138,233,234],{},"调 MCP Server 或浏览器自动化",[15,236,237],{},"Prompt 影响“怎么说”，Tool 决定“能不能做”。Harness 的职责，是把能力暴露得安全、稳定、低歧义。",[187,239,241],{"id":240},"_3-agent-loop","3. Agent Loop",[15,243,244],{},"Agent Loop 是心脏，最基础就是一个迭代循环：",[246,247,248,251,254,257,260,263],"ol",{},[138,249,250],{},"读取当前任务和状态",[138,252,253],{},"让模型判断下一步",[138,255,256],{},"如果需要，调用工具",[138,258,259],{},"读取工具结果",[138,261,262],{},"继续推理",[138,264,265],{},"直到结束",[15,267,268,269,272],{},"我第一次做时也以为一个 ",[113,270,271],{"code":271},"while"," 就够，后来发现难点在策略：",[135,274,275,278,281,284,287],{},[138,276,277],{},"什么时候该继续，什么时候该停",[138,279,280],{},"工具失败后怎么恢复",[138,282,283],{},"任务做一半时怎么避免“宣布胜利”",[138,285,286],{},"上下文太长时怎么压缩或重开 session",[138,288,289],{},"多轮执行时怎么做交接",[15,291,292],{},"这些问题不会靠“换更强模型”自动消失。",[187,294,296],{"id":295},"_4-状态与上下文管理","4. 状态与上下文管理",[15,298,299,300,303],{},"状态管理是最容易被低估的一块。Anthropic 提到的 ",[113,301,302],{"code":302},"initializer agent -> coding agent"," 分工很值得借鉴：先搭环境和进度骨架，再做增量推进，并强制留下交接记录。",[15,305,306],{},"这背后的核心很朴素：",[36,308,309],{},[15,310,311],{},[24,312,313],{},"长任务做不稳，很多时候不是模型不会，而是 session 之间没有可靠交接。",[15,315,316],{},"所以我现在会维护这些资产：",[135,318,319,322,325,328,331,334],{},[138,320,321],{},"progress 文件",[138,323,324],{},"feature checklist",[138,326,327],{},"中间产物",[138,329,330],{},"git 提交或阶段快照",[138,332,333],{},"结构化计划",[138,335,336],{},"压缩后的上下文摘要",[187,338,340],{"id":339},"_5-验证与反馈回路","5. 验证与反馈回路",[15,342,343],{},"我现在判断一个系统是不是“真 Agent”的标准很简单：它有没有验证闭环。",[15,345,346],{},"只会调工具但不会验证，通常只是自动化脚本。",[15,348,349],{},"OpenAI 那篇文章给我的启发是，不是让模型“写更多”，而是让它“看见更多反馈”：",[135,351,352,355,358,361],{},[138,353,354],{},"UI 可被直接驱动",[138,356,357],{},"日志、指标、trace 可被查询",[138,359,360],{},"仓库文档可被检索",[138,362,363],{},"review 和反馈可进入下一轮执行",[15,365,366,367,370],{},"Anthropic 讲的 ",[113,368,369],{"code":369},"planner -> generator -> evaluator"," 也一样，重点在 evaluator: 把“说不清的质量要求”变成可执行反馈。",[15,372,373],{},"团队差距往往不在工具数量，而在闭环质量。",[187,375,377],{"id":376},"_6-权限安全与退出条件","6. 权限、安全与退出条件",[15,379,380],{},"Agent 一旦能执行命令、改文件、发请求，就必须有边界。",[15,382,383],{},"Harness 往往需要明确：",[135,385,386,389,392,395,398],{},[138,387,388],{},"哪些目录可读写",[138,390,391],{},"哪些命令可执行",[138,393,394],{},"什么时候需要人工确认",[138,396,397],{},"什么情况下直接停止",[138,399,400],{},"最终产出要满足哪些检查",[15,402,403,404,407],{},"所以 Anthropic 在 ",[113,405,406],{"code":406},"Managed Agents"," 里把 harness、runtime、sandbox 放在一起讲，我觉得很合理。",[47,409,411],{"id":410},"为什么-harness-engineering-现在突然这么重要","为什么 Harness Engineering 现在突然这么重要",[15,413,414],{},"因为行业目标已经从“回答问题”变成“完成任务”。瓶颈也随之变化：不再只看 prompt 和上下文，而是看 Agent 在真实环境里能不能持续做对动作。",[15,416,417,418,421],{},"OpenAI 的观点我很认同：工程师重心会越来越偏向",[24,419,420],{},"设计环境、表达意图、建立反馈回路","。",[47,423,425],{"id":424},"它和-promptcontexttoolmcp-的关系","它和 Prompt、Context、Tool、MCP 的关系",[15,427,428],{},"这几个概念容易混。看这张表就清楚了：",[55,430,431,441],{},[58,432,433],{},[61,434,435,438],{},[64,436,437],{},"概念",[64,439,440],{},"解决什么问题",[74,442,443,450,457,465],{},[61,444,445,447],{},[79,446,81],{},[79,448,449],{},"怎么把话说清楚",[61,451,452,454],{},[79,453,92],{},[79,455,456],{},"怎么把信息喂对",[61,458,459,462],{},[79,460,461],{},"Tool / MCP",[79,463,464],{},"怎么让模型接触外部能力",[61,466,467,469],{},[79,468,5],{},[79,470,471],{},"怎么把这些东西编排成一个可工作的系统",[15,473,474],{},"Harness 不是替代 Prompt 或 Context，而是更外层的编排层：",[158,476,479],{"className":477,"code":478,"language":163,"meta":164},[161],"意图通过 Prompt 表达\n信息通过 Context 维护\n能力通过 Tool / MCP 边界\n整个运行逻辑通过 Harness 协调\n",[113,480,478],{"__ignoreMap":164},[15,482,483],{},"换个角度说，Prompt 搞定\"说什么\"，Context 搞定\"看什么\"，Harness 才搞定\"怎么干\"。",[47,485,487],{"id":486},"结合视频普通开发者最该学什么","结合视频，普通开发者最该学什么",[15,489,490],{},"我更关心“看懂后怎么落地”。下面这 5 件事可以马上做。",[187,492,494],{"id":493},"_1-不要只写-prompt要写成功标准","1. 不要只写 prompt，要写成功标准",[15,496,497],{},"我现在下任务会先写退出条件：",[135,499,500,503,506,509],{},[138,501,502],{},"复现路径是什么",[138,504,505],{},"修完后要通过哪些测试",[138,507,508],{},"页面上应该看到什么结果",[138,510,511],{},"不允许改动哪些模块",[15,513,514],{},"这样 agent 知道什么时候该停。",[187,516,518],{"id":517},"_2-给-agent-留结构化工位","2. 给 Agent 留结构化工位",[15,520,521],{},"任务一旦超过 30 分钟，我就默认要有“工位文件”：",[135,523,524,529,534,539],{},[138,525,526],{},[113,527,528],{"code":528},"PLAN.md",[138,530,531],{},[113,532,533],{"code":533},"progress.md",[138,535,536],{},[113,537,538],{"code":538},"todo.json",[138,540,541],{},"验收 checklist",[15,543,544],{},"这是最轻量、但很有用的 harness 资产。",[187,546,548],{"id":547},"_3-把验证能力接进来","3. 把验证能力接进来",[15,550,551],{},"常见问题不是“不会做”，而是“做完不验”。优先接这些验证链路：",[135,553,554,557,560,563],{},[138,555,556],{},"测试命令",[138,558,559],{},"lint",[138,561,562],{},"日志查询",[138,564,565],{},"页面截图或浏览器自动化",[187,567,569],{"id":568},"_4-把知识沉到仓库里","4. 把知识沉到仓库里",[15,571,572,575],{},[24,573,574],{},"Agent 看不见的东西，对它来说就不存在。"," 所以规则要尽量沉到仓库：",[135,577,578,581,584,587],{},[138,579,580],{},"仓库内文档",[138,582,583],{},"设计决策记录",[138,585,586],{},"规范文件",[138,588,589],{},"可执行脚本",[187,591,593],{"id":592},"_5-让-harness-尽量简单但要闭环","5. 让 Harness 尽量简单，但要闭环",[15,595,596],{},"Anthropic 的提醒也很关键：harness 每多一个组件，都是在编码一种会过时的假设。",[15,598,599],{},"所以我的策略不是堆复杂度，而是：",[135,601,602,605,608],{},[138,603,604],{},"先做最小可用 loop",[138,606,607],{},"只在真实失败点上补机制",[138,609,610],{},"每次新增组件都问一句：它是不是还在提供真实价值",[47,612,613],{"id":613},"一个更实用的判断标准",[15,615,616],{},"以后再听到别人聊 Harness Engineering，我会先问这三个问题：",[246,618,619,622,625],{},[138,620,621],{},"这个系统给了 Agent 什么可执行能力？",[138,623,624],{},"它怎么维护多轮状态和交接？",[138,626,627],{},"它怎么验证结果并决定继续还是停止？",[15,629,630],{},"这三件事说不清，通常还停留在“Prompt + Tool Use”阶段。",[47,632,633],{"id":633},"最后总结",[15,635,636],{},"看完这支视频，再对照 OpenAI 和 Anthropic 的公开实践，我现在的结论是：",[36,638,639],{},[15,640,641],{},"它不是一个营销词，也不是 Prompt Engineering 的同义替换；它描述的是一整套围绕 Agent Runtime 的工程工作，目标是让模型在真实环境中稳定、可控、可验证地完成多步任务。",[15,643,644],{},"我不太把 Harness Engineering 当成新名词，而把它当成一个工程提醒：",[36,646,647],{},[15,648,649],{},"到了 Agent 阶段，真正的杠杆不只在语言，而在系统。",{"title":164,"searchDepth":651,"depth":651,"links":652},4,[653,655,656,665,666,667,674,675],{"id":49,"depth":654,"text":50},2,{"id":118,"depth":654,"text":119},{"id":181,"depth":654,"text":182,"children":657},[658,660,661,662,663,664],{"id":189,"depth":659,"text":190},3,{"id":213,"depth":659,"text":214},{"id":240,"depth":659,"text":241},{"id":295,"depth":659,"text":296},{"id":339,"depth":659,"text":340},{"id":376,"depth":659,"text":377},{"id":410,"depth":654,"text":411},{"id":424,"depth":654,"text":425},{"id":486,"depth":654,"text":487,"children":668},[669,670,671,672,673],{"id":493,"depth":659,"text":494},{"id":517,"depth":659,"text":518},{"id":547,"depth":659,"text":548},{"id":568,"depth":659,"text":569},{"id":592,"depth":659,"text":593},{"id":613,"depth":654,"text":613},{"id":633,"depth":654,"text":633},[677],"开发",true,"2026-04-09 22:20:00","结合一支讲解 Harness Engineering 的视频，以及 OpenAI、Anthropic 在 2025-2026 年公开披露的实践，系统梳理 Harness 的定义、演进、核心组成、常见误区与落地方法。",false,"md",null,{"slots":685},{},"/2026/harness-engineering",{"text":688,"minutes":689,"time":690,"words":691},"12 min read",11.255,675300,2251,[693,696,699,702,705,708,711],{"title":694,"link":695},"Bilibili · 最近爆火的 Harness Engineering 到底是啥？一期讲透！","https://www.bilibili.com/video/BV1Zk9FBwELs",{"title":697,"link":698},"OpenAI · Harness engineering: leveraging Codex in an agent-first world","https://openai.com/index/harness-engineering",{"title":700,"link":701},"Anthropic · Effective context engineering for AI agents","https://www.anthropic.com/engineering/effective-context-engineering-for-ai-agents",{"title":703,"link":704},"Anthropic · Effective harnesses for long-running agents","https://www.anthropic.com/engineering/effective-harnesses-for-long-running-agents",{"title":706,"link":707},"Anthropic · Harness design for long-running application development","https://www.anthropic.com/engineering/harness-design-long-running-apps",{"title":709,"link":710},"Anthropic · Demystifying evals for AI agents","https://www.anthropic.com/engineering/demystifying-evals-for-ai-agents",{"title":712,"link":713},"Anthropic Docs · Claude Managed Agents overview","https://platform.claude.com/docs/en/managed-agents/overview",{"title":5,"description":680},{"loc":686},"posts/2026/harness-engineering",[718,719,720,5,721,722,723],"AI","Agent","Harness","Claude Code","OpenAI","Anthropic","tech","AkOypXzHe8TNb7zVbZrZdaXxngEJDhpIzQg9yZvnFgI",[727,732],{"title":728,"path":729,"stem":730,"date":731,"type":724,"children":-1},"Java 的 ACM 常用模板","/2026/acm-java-template","posts/2026/acm-java-template","2026-04-04 15:36:18",{"title":733,"path":734,"stem":735,"date":736,"type":737,"children":-1},"单纯想写点什么","/2026/say-sth-to-me","posts/2026/say-sth-to-me","2026-05-08 00:13:14","story",1778311048012]