{"id":4568,"date":"2024-09-24T17:05:49","date_gmt":"2024-09-24T09:05:49","guid":{"rendered":"http:\/\/www.sniper97.cn\/?p=4568"},"modified":"2025-04-08T20:38:17","modified_gmt":"2025-04-08T12:38:17","slug":"%e3%80%90%e6%b7%b1%e5%ba%a6%e5%ad%a6%e4%b9%a0%e7%ac%94%e8%ae%b0%e3%80%91%f0%9f%8d%93o1%e4%b9%8b%e6%9d%a5%e9%be%99%e5%8e%bb%e8%84%89","status":"publish","type":"post","link":"http:\/\/www.sniper97.cn\/index.php\/note\/deep-learning\/note-deep-learning\/4568\/","title":{"rendered":"\u3010\u6df1\u5ea6\u5b66\u4e60\u7b14\u8bb0\u3011\ud83c\udf53o1\u4e4b\u6765\u9f99\u53bb\u8109"},"content":{"rendered":"\n<p><strong>\u6ce8\uff1a\u672c\u6587\u5199\u4e8e2024\u5e749\u670824\u65e5\uff0c\u6709\u4e9b\u6280\u672f\u731c\u6d4b\u53ef\u80fd\u5df2\u7ecf\u88ab\u8bc1\u4f2a\u6216\u8fc7\u65f6\u3002<\/strong>\u4f46\u662f\u4e3b\u7ebf\u6280\u672f\u6ca1\u95ee\u9898\uff0c\u53ea\u662f\u6700\u540e\u7684\u6280\u672f\u731c\u6d4b\u53ef\u80fd\u6709\u9519\u3002<\/p>\n\n\n\n<p>\u672c\u6587\u5c06\u8bd5\u56fe\u4ee5OpenAI\u5386\u5e74\u6765\u7684\u7814\u53d1\u601d\u8def\u53d8\u5316\u4e3a\u4e3b\u7ebf\uff0c\u6d89\u53ca\u5230\u53ef\u80fd\u7684\u6280\u672f\u4ee5\u53ca\u6269\u5c55\uff0c\u76f4\u81f3o1\u7684\u51fa\u73b0\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\" id=\"oILZl\">1. \u56de\u987e\u5386\u53f2\uff1a\u5982\u4f55\u63d0\u9ad8\u6a21\u578b\u7684\u63a8\u7406\u80fd\u529b\uff1f<\/h1>\n\n\n\n<p id=\"u0ceb7741\">\u5f88\u591a\u4efb\u52a1\u4f8b\u5982\u4ee3\u7801\u3001\u6570\u5b66\u7b49\u90fd\u9700\u8981\u6a21\u578b\u5177\u5907\u4e00\u5b9a\u7684\u590d\u6742\u63a8\u7406\u80fd\u529b\u3002\u90a3\u5982\u4f55\u8ba9\u6a21\u578b\u80fd\u591f\u505a\u597d\u6570\u5b66\u9898\u5462\uff1f<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"s2k9g\">1.1. 
\u5982\u4f55\u5b8c\u6210\u4e2d\u7b49\u96be\u5ea6\u7684\u6570\u5b66\u95ee\u9898\uff1f<\/h2>\n\n\n\n<p id=\"u6525b4b4\">\u4e00\u4e2a\u5f88\u81ea\u7136\u800c\u7136\u7684\u60f3\u6cd5\u662f\u5bf9\u6570\u5b66\u4efb\u52a1\u8fdb\u884c\u5fae\u8c03\uff0c\u8fd9\u4e5f\u662f\u73b0\u5728\u5f88\u591axx\u65b9\u5411\u5782\u57df\u5f3a\u5316\u5728\u505a\u7684\u4e8b\u60c5\u3002<\/p>\n\n\n\n<p id=\"u881f8929\">\u9996\u5148\u5bf9\u6a21\u578b\u8fdb\u884c\u5fae\u8c03[1]\uff0c\u53d1\u73b0\u5982\u679c\u60f3\u5728\u4f8b\u5982GSM8k\u4e0a\u8fd9\u7c7b\u4e2d\u7b49\u96be\u5ea6\u6570\u5b66\u9898\u4e0a\u8fbe\u523080%\u7684\u51c6\u786e\u7387\uff0c\u6a21\u578b\u5c3a\u5bf8\u81f3\u5c11\u8981\u653e\u5927\u523010^16\u7684\u91cf\u7ea7\uff0c\u6216\u8005\u6570\u636e\u91cf\u589e\u52a02\u4e2a\u6570\u91cf\u7ea7\u3002\u4f46\u662f\u65e0\u8bba\u4f55\u79cd\u65b9\u5f0f\uff0c\u90fd\u662f\u4e0d\u53ef\u63a5\u53d7\u7684\u6210\u672c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728114310677-20b4975c-a4e9-43c0-834b-f41c70889a2d.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u6f32e33b\">\u968f\u7740\u8bad\u7ec3epoch\u7684\u589e\u52a0PASS@1\u6301\u7eed\u4e0a\u6da8\uff0cPASS@100\u6301\u7eed\u4e0b\u964d\uff0c\u8bf4\u660e\u6a21\u578b\u7684\u89e3\u7a7a\u95f4\u4e0d\u65ad\u53d8\u5c0f\uff0c\u7b54\u6848\u7684\u591a\u6837\u6027\u6301\u7eed\u4e0b\u964d\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728114402126-0d3dec57-0a6b-4c77-8435-f3a4fc6cdfde.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"ue4562658\">\u6574\u4f53\u770b\u901a\u8fc7\u5fae\u8c03\uff08pt scaling law\uff09\u5e26\u6765\u7684\u6027\u80fd\u4e0a\u5347\u5e26\u6765\u7684\u6536\u76ca\u4f3c\u4e4e\u662f\u6709\u9650\u7684\uff0c\u90a3\u65e2\u7136\u5728PASS@100\u7684epoch 
2\u9644\u8fd1\u8fbe\u5230\u4e86\u6700\u9ad8\u7684\u901a\u8fc7\u738784%\uff0c\u662f\u5426\u53ef\u4ee5\u901a\u8fc7\u8bad\u7ec3\u4e00\u4e2a\u6a21\u578b\u901a\u8fc7\u5224\u65ad\u56de\u7b54\u662f\u5426\u6b63\u786e\u5728100\u4e2a\u7b54\u6848\u4e2d\u8ba9\u6a21\u578b\u9009\u62e9\u6b63\u786e\u7684\u4e00\u4e2a\u4f5c\u4e3a\u6a21\u578b\u6700\u7ec8\u7684\u8f93\u51fa\u5462\uff1f<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728115326606-050170c2-9ab6-4b76-a43d-817aa3c96eb9.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u1954a9ae\">\u9996\u5148[1]\u8bad\u7ec3\u4e86\u4e00\u4e2a\u5bf9\u7ed3\u679c\u6b63\u786e\u4e0e\u5426\u8fdb\u884c\u5224\u522b\u7684\u5224\u522b\u5668\u2014\u2014ORM\uff08Outcome-supervised Reward Models\uff09\uff0c\u6765\u76f4\u63a5\u5bf9\u6a21\u578b\u7ed3\u679c\u8fdb\u884c\u5224\u65ad\u3002\u8be5\u5224\u522b\u5668\u901a\u8fc7\u5bf9\u7ed3\u679c\u662f\u5426\u51c6\u786e\u7684\u4e8c\u5206\u7c7b\u4e3a\u76ee\u7684\u8fdb\u884c\u8bad\u7ec3\u3002<\/p>\n\n\n\n<p id=\"u71db7f5a\">At training time, the reward model makes predictions for every token in the context. 
The target for each token in a solution is the same, based on whether the solution is labelled correct or incorrect.[2]<\/p>\n\n\n\n<p id=\"u52c339e6\">\u7531\u4e8e\u6570\u636e\u5fae\u8c032\u4e2aepoch\u7684PASS@100\u662f\u6700\u9ad8\u7684\uff0c\u56e0\u6b64\u4f7f\u7528\u8be5checkpoint\u91c7\u6837100\u4e2a\uff0c\u901a\u8fc7\u5bf9\u7ed3\u679c\u8fdb\u884c\u7b80\u5355\u5224\u65ad\uff0c\u8bad\u7ec3\u4e86\u4e00\u4e2a\u5224\u522b\u5668\uff0c\u53d1\u73b0verification\u4efb\u52a1\u5177\u6709\u8f83\u5927\u7684\u96be\u5ea6\uff0c\u56e0\u6b64\u5728\u524d\u671f\u7684\u6027\u80fd\u8f83\u5dee\uff0c\u4f46\u662f\u968f\u7740\u6570\u636e\u89c4\u6a21\u7684\u4e0a\u5347\uff0cverification\u7684\u6a21\u578b\u53ef\u4ee5\u5f88\u5feb\u7684\u8d85\u8fc7finetuning\u7684\u6a21\u578b\uff0c\u5e76\u4e14\u4f18\u52bf\u6301\u7eed\u6269\u5927\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728115400426-a927bf06-0359-45de-aced-92d0badbb6e8.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p 
id=\"u5aa3143b\">\u540c\u65f6\uff0c\u5206\u7c7b\u4efb\u52a1\u4e0d\u4ec5\u53ef\u4ee5\u4ee5solution\u7ea7\u522b\u6765\u8fdb\u884c\u5206\u7c7b\uff0c\u540c\u6837\u53ef\u4ee5\u4ee5token\u7ea7\u522b\u6765\u8fdb\u884c\u5206\u7c7b\u3002token\u7ea7\u522b\u4f1a\u66f4\u52a0\u5173\u6ce8\u8fc7\u7a0b\u4e00\u4e9b\uff0c\u6548\u679c\u4e5f\u4f1a\u66f4\u597d\u4e00\u4e9b\uff08\u6570\u636e\u5229\u7528\u7387\u4e5f\u66f4\u9ad8\uff09\uff0c\u4e0d\u8fc7\u663e\u7136\uff0ctoken\u7ea7\u522b\u7684\u5224\u522b\u662f\u66f4\u96be\u7684\uff0c\u56e0\u6b64ORM_token\u524d\u671f\u7684\u6548\u679c\u5176\u5b9e\u662f\u4e0d\u5982ORM_solution\u7684\u6548\u679c\u597d\uff0c\u4f46\u662f\u968f\u7740\u8bad\u7ec3\u6570\u636e\u7684\u589e\u52a0\uff0c\u663e\u7136ORM_solution\u5728\u6162\u6162\u7684\u88abhack\uff0c\u6548\u679c\u5728\u4e0b\u964d\uff0c\u800cORM_token\u7ea7\u522b\u7684\u6548\u679c\u5219\u66f4\u597d\u4e5f\u66f4\u9c81\u68d2(\u4e0b\u56fea\uff09\u3002\u540c\u65f6\u8fd8\u9a8c\u8bc1\u4e86\u5c06\u9884\u8bad\u7ec3\u4efb\u52a1\u548c\u5224\u522b\u4efb\u52a1\u878d\u5408\u8d77\u6765\u8bad\u7ec3\uff08\u7c7b\u4f3c\u4e8eBert\u7684MLM\u548cNSP\uff09\u6548\u679c\u4f1a\u66f4\u597d\u4e00\u70b9(\u4e0b\u56feb)\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728118692191-2e92631e-c9d5-42c6-8f26-af597a896beb.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p 
id=\"uaf391e4b\">\u53e6\u5916\uff0c\u89c2\u5bdf\u4e0a\u56fec\uff0c\u53ef\u4ee5\u603b\u7ed3\u5230\uff1a\u4e4b\u6240\u4ee5ORM\u6709\u6548\uff0c\u53ef\u80fd\u7684\u539f\u56e0\u67092\u4e2a\uff1a<\/p>\n\n\n\n<ul>\n<li>\u6a21\u578b\u5728\u751f\u6210\u5230\u540e\u9762\u4e4b\u540e\u5176\u5b9e\u662f\u53ef\u4ee5\u53d1\u73b0\u524d\u9762\u662f\u9519\u8bef\u7684\uff0c\u4f46\u662f\u7531\u4e8e\u4e0d\u80fd\u8fdb\u884cbacktrack\uff0c\u56e0\u6b64\u53ea\u80fd\u5c06\u9519\u5c31\u9519\uff0c\u4ece\u800c\u4ea7\u751f\u9519\u8bef\u6216\u8005\u5e7b\u89c9\u3002<\/li>\n\n\n\n<li>\u6a21\u578b\u5b66\u5230\u4e86\u4e00\u4e9b\u8f83\u4e3a\u7b80\u5355\u7684\u542f\u53d1\u5f0f\u89c4\u5219\u7b49\uff0c\u4ece\u800c\u5141\u8bb8\u5c0f\u6a21\u578b\uff086B\uff09\u4f9d\u7136\u53ef\u4ee5\u6821\u6b63\u5927\u6a21\u578b\uff08175B\uff09\u7684\u8f93\u51fa\u3002<\/li>\n<\/ul>\n\n\n\n<p id=\"u985da394\">\u5230\u8fd9\u91cc\uff0c\u5c31\u53ef\u4ee5\u901a\u8fc7\u5728100\u4e2a\u91c7\u6837\u4e2d\u901a\u8fc7\u4e00\u4e2areward model\u627e\u51fa\u6700\u6b63\u786e\u7684\u90a3\u4e2aresponse\uff0c\u505a\u5230\u6574\u4e2a\u6a21\u578b\u7684\u8f93\u51fa\u6b63\u786e\u3002\u76f8\u6bd4\u53ea\u91c7\u68371\u4e2a\u7b54\u6848\uff0c\u91c7\u6837100\u4e2a\u7b54\u6848\u5728\u63a8\u7406\u9636\u6bb5\u7684\u7b97\u529b\u8981\u6c42\u66f4\u9ad8\uff0c\u4f46\u662f\u66f4\u51c6\u786e\u3002\u56e0\u6b64\u5f15\u51fa\u4e00\u4e2a\u65b0\u7684scaling law\u2014\u2014<strong>inference time scaling law<\/strong>\u3002<\/p>\n\n\n\n<p id=\"u5adc0409\"><strong>\u53e6\u5916\uff0c\u7531\u4e8everification\u5b9e\u9645\u4e0a\u7c7b\u4f3c\u4e8e\u73b0\u5728\u7684reward model\uff0c\u56e0\u6b64\u731c\u6d4b\u4ece\u8fd9\u65f6\u5f00\u59cb\uff0cOpenAI\u5c31\u5df2\u7ecf\u610f\u8bc6\u5230pre-training\u9636\u6bb5\u7684\u7b97\u529bROI\u662f\u8fdc\u4f4e\u4e8epost-training\uff08RL\uff09\u9636\u6bb5\u7684ROI\u7684\u3002<\/strong><\/p>\n\n\n\n<p 
id=\"u8c7aff24\">\u8bba\u6587\u901a\u8fc7\u8fd9\u6837\u4e00\u6761pipeline\uff0c\u8ba9\u6a21\u578b\u5177\u5907\u4e86\u81ea\u6211\u6279\u8bc4\u5e76\u6539\u6b63\u7684\u80fd\u529b\uff0c\u6a21\u578b\u8d8a\u5927\u81ea\u6211\u6279\u8bc4\u4e0e\u6539\u6b63\u7684\u80fd\u529b\u8d8a\u5f3a\u3002\u540c\u65f6\u8bba\u6587\u4e2d\u505a\u4e86\u4e00\u4e9b\u5bf9\u6bd4\u5b9e\u9a8c\uff0c\u6574\u4f53\u770b\u5728pipeline\u4e2d\u8ba9\u6a21\u578b\u505a\u4e00\u4e2a\u4e8c\u5206\u7c7b\u6548\u679c\u662f\u6bd4\u4e0d\u505a\u66f4\u597d\u7684\uff0c\u5e76\u4e14\u8fd9\u4e2a\u4e8c\u5206\u7c7b\u7684logits\u672c\u8eab\u4e5f\u662f\u4e00\u4e2a\u4fe1\u53f7\uff0c\u53ef\u4ee5\u7528\u6765\u505aBoN\u7b5b\u9009\u3002<\/p>\n\n\n\n<p id=\"u23ebda49\">\u4f46\u662f\u7531\u4e8e\u8fd9\u7bc7\u8bba\u6587\u662f22\u5e74\u7684\u8bba\u6587\uff0c\u5f53\u65f6\u6a21\u578b\u80fd\u529b\u8fd8\u76f8\u5bf9\u8f83\u5dee\uff08InstructGPT\uff09\uff0c\u540e\u7eedOpenAI\u7684\u5de5\u4f5c\u6539\u8fdb\u4e86\u8fd9\u5957pipeline\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" id=\"qikTl\"\/>\n\n\n\n<p id=\"u5567f73f\">OpenAI\u572824\u5e74\u53c8\u53d1\u5e03\u4e86\u4e00\u7bc7\u7c7b\u4f3c\u7684\u6587\u7ae0CriticGPT[8]\uff0c\u53bb\u6389\u4e86\u4e8c\u5206\u7c7b\u5224\u522b\uff0c\u5e76\u4e14\u4f7f\u7528\u4e86\u65b0\u7684RLHF\u6d41\u7a0b\u3002<\/p>\n\n\n\n<p id=\"u312e00e7\">\u8fd9\u7bc7\u8bba\u6587\u7684\u5de5\u4f5c\u4e3b\u8981\u5728Code\u4e0a\u3002<\/p>\n\n\n\n<p id=\"ud2407ce3\">\u7531\u4e8e\u73b0\u6709\u76844\/4o\u5bf9\u4e8e\u4ee3\u7801\u95ee\u9898\u4f9d\u7136\u5e7b\u89c9\u6bd4\u8f83\u4e25\u91cd\uff0c\u5e76\u4e14\u6709\u65f6\u5019\u4f1a\u5e26\u6709\u8fc7\u5206\u7684\u5439\u6bdb\u6c42\u75b5\uff0c\u56e0\u6b64\u8bad\u7ec3\u4e00\u4e2a\u6a21\u578b\u7528\u6765\u505acode review\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728368279110-c874c5f5-1d93-44ba-b537-859e89538461.png\" alt=\"\" 
title=\"\"\/><\/figure>\n\n\n\n<p id=\"u98ce1da6\">\u8be5\u8bba\u6587\u7684\u505a\u6cd5\u548cChatGPT\u7684RLHF\u6bd4\u8f83\u7c7b\u4f3c\uff1a<\/p>\n\n\n\n<ol>\n<li>\u4f7f\u7528\u4e00\u7ec4prompt\u5728GPT4\u4e0a\u751f\u6210n\u7ec4QA\u5bf9\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528\u4eba\u5de5\u6807\u6ce8\u8fd9\u4e9bQA\u5bf9\uff0c\u4e3a\u8fd9\u4e9bA\u7f16\u5199critic\u3002\u5305\u62ecChatGPT\u672c\u8eab\u5b58\u5728\u7684bug\u548c\u4e3b\u52a8\u63d2\u5165\u5047\u88c5\u662fChatGPT\u8f93\u51fa\u7684bug\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528\u8fd9\u4e9b\u6807\u6ce8\u540e\u7684critic\uff0c\u4eba\u5de5\u8fdb\u884c\u8bc4\u5206\uff0c\u8bad\u7ec3reward model\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528PPO\u8bad\u7ec3Actor\u6a21\u578b\u3002<\/li>\n<\/ol>\n\n\n\n<p id=\"u36fad8d6\">\u8fd9\u4e2a\u8fc7\u7a0b\u4f7f\u7528\u4e86\u4e00\u4e2a\u65b0\u7684\u89e3\u7801\u65b9\u5f0f\uff0c\u7528\u6765\u5e73\u8861\u5439\u6bdb\u6c42\u75b5\u548ccritic\u7684\u6709\u5e2e\u52a9\u6027\u2014\u2014<strong>Force Sampling Beam Search\uff08FSBS\uff09<\/strong>\u3002\u8fd9\u91cc\u5176\u5b9e\u5c31\u548co1\u7684\u6574\u4f53\u8bad\u7ec3\/\u89e3\u7801\u65b9\u5f0f\u6bd4\u8f83\u50cf\u4e86\uff0c\u6bd5\u7adf\u4e5f\u662f\u76f8\u8fd1\u7684\u5de5\u4f5c\uff0c\u540e\u9762\u4f1a\u63d0\u5230\u3002<\/p>\n\n\n\n<p id=\"u153f5dbd\">\u4ecb\u7ecd\u4e00\u4e0b\u8fd9\u4e2aFSBS\uff0c\u5982\u4f55\u5e73\u8861\u6709\u5e2e\u52a9\u6027\u548c\u5439\u6bdb\u6c42\u75b5\u5462\uff1f\u90a3\u5c31\u662f\u7528\u957f\u5ea6\u60e9\u7f5a\u3002<\/p>\n\n\n\n<p id=\"ub65106d1\">\u5728\u89e3\u7801\u65f6\uff0c\u4f7f\u7528\u4e86\u5982\u4e0b\u673a\u5236\uff1a<\/p>\n\n\n\n<ul>\n<li>\u9996\u5148\u5bf9\u6bcf\u4e00\u4e2aQA\uff0c\u5f3a\u5236\u6dfb\u52a0```\uff08\u8f93\u51facritic\u7684pattern\uff09\uff0c\u8ba9\u6a21\u578b\u8f93\u51fa4\u4e2aresponse\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528reward 
model\uff0c\u9009\u62e9\u5176\u4e2d\u6700\u4f18\u76842\u4e2acritic\u3002<\/li>\n\n\n\n<li>\u5bf9\u8fd9\u4e9b\u7b54\u6848\u5220\u9664EOS\uff0c\u6dfb\u52a0```\u7ee7\u7eed\u5f3a\u5236\u8f93\u51facritic\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528reward model\uff0c\u9009\u62e9\u5176\u4e2d\u6700\u4f18\u76842\u4e2acritic\u3002<\/li>\n<\/ul>\n\n\n\n<p id=\"u86c23313\">\u6301\u7eed\u8fed\u4ee34\u6b21\uff0c\u603b\u8ba1\u4ea7\u751f28\u4e2aresponse\u3002\u8fd9\u4e9bresponse\u5bf9\u540c\u4e00\u4e2aprompt\u8f93\u51fa\u4e86\u4e0d\u540c\u6570\u91cf\u7684critic\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728466796303-ee904b8d-3d0b-4654-b37d-4ed44f995d6f.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u1b6da821\">\u968f\u540e\u5bf9\u591acritic\u548c\u957fcritic\u7684response\u8fdb\u884c\u60e9\u7f5a\uff0c\u4ee5\u5e73\u8861\u6709\u5e2e\u52a9\u6027\u548c\u5439\u6bdb\u6c42\u75b5\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/d3ed0faaf5ad04223c03066548074bd9.svg\" alt=\"\"\/><\/figure>\n\n\n\n<p id=\"uc2f16f3c\">\u56e0\u6b64\uff0c\u5982\u679c\u6a21\u578b\u60f3\u8f93\u51fa\u8f83\u591a\u6570\u91cf\u7684critic\uff0c\u5c31\u5fc5\u987b\u663e\u8457\u63d0\u9ad8critic\u7684\u8d28\u91cf\uff0c\u6709\u6548\u5730\u5e73\u8861\u4e86\u8f93\u51fa\u7684\u8d28\u91cf\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728543075830-97e625b0-f409-450a-8ad5-767d589ce1b8.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" id=\"Tb7OX\"\/>\n\n\n\n<p 
id=\"u780c6cf6\">\u5c31\u8fd9\u4e9b\u4e86\u5417\uff1f\u76ee\u524d\u4e3a\u6b62\u6240\u6709\u7684\u5b9e\u9a8c\u51b7\u542f\u52a8\u90fd\u9700\u8981\u5927\u91cf\u7684\u4eba\u5de5\u6807\u6ce8\uff0cCriticGPT\u7684\u6807\u6ce8\u4f7f\u7528\u4e86\u5177\u67095\u5e74Python\u7ecf\u9a8c\u7684\u4e13\u4e1a\u4eba\u5458\uff0c\u6bcf\u6761\u5e73\u5747\u8017\u65f650\u5206\u949f\u3002\u6807\u6ce8\u6210\u672c\u6781\u5176\u6602\u8d35\u3002\u90a3\u6709\u4ec0\u4e48\u4fbf\u5b9c\u4e00\u70b9\u7684\u65b9\u6848\u5462\uff1f<\/p>\n\n\n\n<p id=\"ub78931a3\">Anthropic\u8ba4\u4e3a\uff0c\u4e3a\u4ec0\u4e48\u6211\u4eec\u8fd8\u8981\u8d39\u5f88\u591a\u529b\u6c14\u544a\u8bc9\u6a21\u578b\u4ec0\u4e48\u662f\u5bf9\u7684\u4ec0\u4e48\u662f\u9519\u7684\uff1f\u4eba\u7c7b\u5f88\u591a\u65f6\u5019\u53ea\u9700\u8981\u4e00\u4e2a\u5f88\u7c97\u7684\u201cConstitution\u201d\u5c31\u53ef\u4ee5\u754c\u5b9a\u4ec0\u4e48\u4e8b\u60c5\u53ef\u4ee5\u505a\u4ec0\u4e48\u4e8b\u60c5\u4e0d\u53ef\u4ee5\u505a\uff0c\u800c\u4e0d\u7528\u4e8b\u65e0\u5de8\u7ec6\u5730\u544a\u8bc9\u4f60\u4e0d\u53ef\u4ee5\u62a2\u94f6\u884c\u4e0d\u53ef\u4ee5\u641e\u7834\u574f\u3002<\/p>\n\n\n\n<p id=\"u70268148\">\u56e0\u6b64Anthropic\u63d0\u51fa\u4e86AI\u5baa\u6cd5(Constitutional AI)\uff0c\u5168\u6d41\u7a0b\u90fd\u4f7f\u7528AI\u76d1\u7763AI\uff0c\u6ca1\u6709\u4eba\u5de5\u4ecb\u5165\uff0c\u56e0\u6b64Anthropic\u79f0\u8fd9\u79cd\u8303\u5f0f\u4e3aRLAIF (RL from AI Feedback)\u3002<\/p>\n\n\n\n<p id=\"u6eeb4eb0\">\u9996\u5148\u4f7f\u7528\u4e00\u4e2a\u672a\u7ecf\u5b89\u5168\u5bf9\u9f50\u7684RLHF\uff08Helpful RLHF\uff09\u6a21\u578b\uff0c\u4f7f\u7528\u4e00\u4e9b\u5bf9\u6297prompt\uff08\u5bb9\u6613\u4ea7\u751f\u6709\u5bb3\u8f93\u51fa\uff09\u83b7\u5f97\u4e00\u4e2a\u6709\u5bb3\u7684response\uff0c\u901a\u8fc7\u5b9a\u4e49\u201c\u5baa\u6cd5\u201d\u8ba9\u6a21\u578b\u81ea\u6211\u6279\u8bc4\/\u6539\u5199\uff0c\u83b7\u5f97\u4e00\u4e2a\u65e0\u5bb3\u7684SL-CAI(Supervised Learning Constitutional AI)\u6a21\u578b\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img 
decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728544623489-aeef69f4-a645-4fef-b6a8-2f7c7b72ded8.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u4c08d134\">\u968f\u540e\u7ee7\u7eed\u4f7f\u7528SL-CAI\u751f\u6210QA\u5bf9\uff0c\u7ee7\u7eed\u7531\u6a21\u578b\u81ea\u5df1\u6253\u6807\u7b7e\u8ba4\u4e3a\u54ea\u4e00\u4e2a\u7b54\u6848\u662f\u66f4\u4f18\u79c0\u7684\u4e00\u4e2a\u3002\u6700\u7ec8\u4f7f\u7528\u8fd9\u4e2aRM\u7684soft label\u5230\u5e38\u89c4\u7684RLH(AI)F\u7684\u6d41\u7a0b\u4e2d\uff0c\u83b7\u5f97RL-CAI\u6a21\u578b\u3002\u540c\u65f6\u8fd9\u4e2a\u8fc7\u7a0b\u4e5f\u53ef\u4ee5\u4e3b\u52a8\u6dfb\u52a0\u201cthink step by step\u201d\u83b7\u5f97CoT\u7248\u672c\u7684RL-CAI\u3002<\/p>\n\n\n\n<p id=\"u1e7f7b49\">\u6a21\u578b\u6700\u7ec8\u7684\u6548\u679c\u6709\u5e2e\u52a9\u6027\u4ecb\u4e8e\u672a\u7ecf\u8fc7\u65e0\u5bb3\u6027\u5fae\u8c03\u7684RLHF\uff08Helpful RLHF\uff09\u548c\u7ecf\u8fc7\u65e0\u5bb3\u6027\u5fae\u8c03\u7684RLHF\uff08HH RLHF\uff09\u6a21\u578b\u7684\u4e2d\u95f4\u3002\u540c\u65f6\u65e0\u5bb3\u6027\u663e\u8457\u5f3a\u4e8e\u4e24\u8005\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728545082051-782d00f2-29ce-4870-9aca-bd3f9036bfd9.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u107fa656\">\u8fd9\u7bc7\u6587\u7ae0\u63d0\u4f9b\u4e86\u4e00\u4e2a\u4f4e\u6210\u672c\u7684\u6570\u636e\u8fed\u4ee3\u89c6\u89d2\uff0c\u4e0d\u518d\u5c40\u9650\u4e8e\u6709\u660e\u786e\u6b63\u786e\/\u9519\u8bef\u7684\u7406\u5de5\u79d1\uff0c\u53ef\u4ee5\u8ba9\u6a21\u578b\u81ea\u6211\u6279\u5224\u76f8\u5bf9\u7075\u6d3b\u7684guideline\uff0c\u5e76\u4e14\u9a8c\u8bc1\u4e86\u6709\u6548\u6027\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"I5piF\">1.2.2. 
\u8fc7\u7a0b\u6b63\u786e<\/h3>\n\n\n\n<p id=\"u012c7613\">\u4e0a\u9762\u63d0\u5230\u4e86\u4f7f\u7528ORM\u6765\u505a\u6a21\u578b\u56de\u7b54\u662f\u5426\u6b63\u786e\u7684\u5224\u522b\u65b9\u5f0f\uff0c\u867d\u7136\u6211\u4eec\u5df2\u7ecf\u505a\u5230\u4e86\u7ed3\u679c\u6b63\u786e\uff0c\u4f46\u662fORM\u8fd9\u91cc\u663e\u7136\u5b58\u5728\u4e00\u4e2a\u95ee\u9898\uff0c\u5c31\u662f\u6a21\u578b\u751f\u6210\u5b58\u5728\u8fc7\u7a0b\u662f\u9519\u8bef\u7684\uff0c\u4f46\u662f\u7ed3\u679c\u662f\u5bf9\u7684\u60c5\u51b5\u3002\u8fd9\u90e8\u5206\u6570\u636e\u4f1a\u5bf9\u6a21\u578b\u7684\u5224\u65ad\u4ea7\u751f\u5f71\u54cd\uff0c\u540c\u65f6\u8fd8\u6709\u4e00\u5b9a\u7684CAP(Credit Assignment Problem)[3][4]\u95ee\u9898\uff0c\u5373\u7ed3\u679c\u662f\u5bf9\u7684\uff0c\u4f46\u662f\u7a76\u7adf\u662f\u54ea\u4e00\u6b65\u5bf9\u7ed3\u679c\u7684\u5f71\u54cd\u662f\u6700\u5927\u7684\uff1f<\/p>\n\n\n\n<p id=\"u69a05ca6\">\u5bf9\u4e8e\u4e00\u4e9b\u5f3a\u63a8\u7406\u95ee\u9898\uff0c\u5f88\u591a\u6587\u7ae0\u8bc1\u660e\u4e86\u5fc5\u987b\u4f7f\u7528\u7c7b\u4f3c\u4e8eCoT[6]\u7684\u505a\u6cd5\u6765\u83b7\u5f97\u8fc7\u7a0b\u5e76\u63d0\u5347\u7ed3\u679c\u7684\u51c6\u786e\u6027\uff0c\u8d8a\u6765\u8d8a\u591a\u7684\u8bba\u6587\u4e5f\u8bc1\u660e\u4e00\u6761\u5f3a\u63a8\u7406\u4efb\u52a1\u5fc5\u987b\u9700\u8981\u4e00\u4e2a\u9ad8\u8d28\u91cfCoT\u624d\u53ef\u4ee5\u5b8c\u6210\uff0c\u5c0f\u6a21\u578b+\u65e0\u9650\u957f\u7684CoT\u53ef\u4ee5\u89e3\u51b3\u4e16\u754c\u4e0a\u4efb\u4f55\u95ee\u9898[11][12]\u3002\u56e0\u6b64\u5982\u679c\u6211\u4eec\u7684\u4e2d\u95f4CoT\u8fc7\u7a0b\u53ef\u4ee5\u50cf\u4e0a\u6587\u7684critics\u4e00\u6837\u6709\u660e\u786e\u7684\u5206\u754c\u7ebf\uff0c\u4f7f\u7528\u4e0a\u6587\u7c7b\u4f3c\u7684thoughts\u9009\u62e9\uff0c\u662f\u4e0d\u662f\u5c31\u53ef\u4ee5\u62ff\u5230\u4e00\u6761\u9ad8\u8d28\u91cf\u63a8\u7406\u94fe\u8def\u5462\uff1f<\/p>\n\n\n\n<p 
id=\"uf15fd69c\">\u4e5f\u5c31\u662f\u8bf4\uff0c\u5982\u679c\u6709\u4e00\u4e2a\u65b0\u7684\u8303\u5f0f\uff0c\u6a21\u578b\u53ef\u4ee5\u4ee5CoT\u683c\u5f0f\u8f93\u51fa\u83b7\u5f97\u66f4\u4f18\u79c0\u7684\u7ed3\u679c\u7684\u540c\u65f6\uff0c\u4f1a\u6709\u4e00\u4e2a\u6a21\u578b\u4e0d\u65ad\u68c0\u67e5CoT\u8def\u5f84\u662f\u5426\u6b63\u786e\uff0c\u662f\u4e0d\u662f\u5c31\u65e2\u53ef\u4ee5\u8fc7\u7a0b\u6b63\u786e\uff0c\u53c8\u53ef\u4ee5\u7ed3\u679c\u6b63\u786e\u4e86\u5462\uff1f<\/p>\n\n\n\n<p id=\"u18d33ee4\">\u56e0\u6b64OpenAI\u8bbe\u8ba1\u4e86\u4e00\u4e2a\u65b0\u7684reward model\u8bad\u7ec3\u65b9\u5f0f\u2014\u2014PRM[2]\uff08Process-supervised Reward Models)\u3002\u5b83\u901a\u8fc7\u5bf9\u8bc1\u660e\u8fc7\u7a0b\u4e2d\u7684\u6bcf\u4e00\u6b65\u8fdb\u884c\u6807\u6ce8\uff0c\u4ece\u800c\u5224\u65ad\u6bcf\u4e00\u6761CoT\u662f\u5426\u6b63\u786e\u3002\u6807\u7b7e\u5206\u4e3a\u4e09\u7c7b\uff0c\u9664\u4e86\u660e\u786e\u7684\u9519\u8bef\u3001\u6b63\u786e\u4e4b\u5916\uff0c\u8fd8\u65b0\u589e\u4e86\u4e00\u4e2a\u65e0\u6cd5\u5224\u65ad(ambiguity)\u3002<strong>\u5f00\u6e90\u4e86800k\u8fd9\u7c7b\u6570\u636e(PRM800k)\u3002<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728529194625-0f6bc5cc-d05f-4637-b6bd-9e17d6595893.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"ub392b414\">\u76f8\u6bd4ORM\uff0cPRM\u7684\u8bad\u7ec3\u4e5f\u6709\u4e00\u4e9b\u53d8\u5316\u3002\u53ef\u80fd\u662f\u8003\u8651\u5230\u5206\u5e03\u8f6c\u79fb(distribution 
shift)\uff0cPRM\u6ca1\u6709\u7ee7\u7eed\u4f7f\u7528ORM\u7684\u5206\u7c7b\u4efb\u52a1\uff0c\u800c\u662f\u6cbf\u7528\u4e86Generate\u4efb\u52a1\u3002\u6574\u4f53\u65b9\u5f0f\u53d8\u6210\uff1a<\/p>\n\n\n\n<ul>\n<li>\u6bcf\u4e2a\u4e2d\u95f4step\u4e2d\u4f7f\u7528\u4e00\u4e2a<code>request_rating_token<\/code>\u548c\u4e00\u4e2a<code>placeholder<\/code>\u8fde\u63a5\u3002<\/li>\n\n\n\n<li>\u6bcf\u4e00\u4e2astep\u90fd\u53d6\u51fa<code>placeholder<\/code>\u524d\u7684logit\u7ee7\u7eed\u751f\u6210\u4e0b\u4e00\u4e2atoken\uff0c\u83b7\u5f97rating\u3002<\/li>\n<\/ul>\n\n\n\n<pre id=\"RP3gX\" class=\"wp-block-code\"><code>Train:\n                              --------------------&gt;rating_for_step_1\nquestion_tokens,              |\nsolution_step1_tokens, request_rating_token, placeholder, \nsolution_step2_tokens, request_rating_token, placeholder, \n                              |-------------------&gt; rating_for_step_2\nsolution_step3_tokens, request_rating_token ------&gt; rating_for_step_3\n\n<\/code><\/pre>\n\n\n\n<p id=\"u9767c9cf\">\u901a\u8fc7\u8fd9\u79cd\u65b9\u5f0f\uff0c\u53ef\u4ee5\u901a\u8fc7\u4e00\u6b21\u8ba1\u7b97\u5c31\u641e\u5b9a\u4e00\u4e2asample\u7684\u8bad\u7ec3[31,32]\u3002\u53ea\u4e0d\u8fc7\u9700\u8981\u4e00\u4e9b\u5de5\u7a0b\u652f\u6301\uff0c\u76ee\u524dOpenRLHF\u5df2\u7ecf\u652f\u6301\u4e86\u8fd9\u79cd\u8bad\u7ec3\u65b9\u5f0f\u3002<\/p>\n\n\n\n<p id=\"u8b0c1a7f\">\u76f8\u6bd4ORM\u7684\u8bad\u7ec3\u65b9\u5f0f\uff0c\u4f18\u52bf\u4e3a\uff1a<\/p>\n\n\n\n<ul>\n<li>\u5feb\uff0c\u4e00\u6b21\u8bad\u7ec3n\u4e2astep\uff0c\u4f8b\u5982PRM800K\u5b9e\u9645\u4e0a\u53ea\u670912k\u4e2asample\uff0c\u6781\u5927\u52a0\u901f\u4e86\u8bad\u7ec3\u6d41\u7a0b\u3002<\/li>\n\n\n\n<li>\u6a21\u578b\u7684\u5206\u5e03\u8f6c\u79fb(distribution shift)\u60c5\u51b5\u66f4\u5c0f\uff0c\u6a21\u578b\u66f4\u5bb9\u6613\u8bad\u7ec3\u3002<\/li>\n<\/ul>\n\n\n\n<p id=\"ube5a6215\">\u6574\u4f53\u770bPRM\u7684\u6548\u679c\u663e\u8457\u5f3a\u4e8eORM\u548c\u6295\u7968\u3002<\/p>\n\n\n\n<figure 
class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728529222007-2fd8e5fc-be17-439f-9903-d16c4f6a1d1f.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u637be4e3\">\u5e76\u4e14\u7531\u4e8ePRM\u7684\u4ecb\u5165\uff0c\u6a21\u578b\u7684\u5176\u4ed6\u80fd\u529b\u4e5f\u5f97\u5230\u4e86\u4e00\u5b9a\u7684\u4e0a\u6da8\u3002\u6bd4\u5982\u8fd9\u7bc7\u8bba\u6587\u5728Math\u4e0a\u505a\u7684\u5b9e\u9a8c\uff0c\u4e00\u4e9bOOD\u5b9e\u9a8c\u8868\u660e\u5bf9\u4e8e\u7269\u7406\u3001\u5316\u5b66\u7b49\u9886\u57df\u7684\u6027\u80fd\u4e5f\u5f97\u5230\u4e86\u663e\u8457\u589e\u957f\u3002\u8fd9\u79cdcookie\u4e3a\u6a21\u578b\u5176\u4ed6\uff08\u7406\u5de5\u79d1\uff09\u9886\u57df\u7684\u80fd\u529b\u4e0a\u6da8\u63d0\u4f9b\u4e86\u4e00\u5b9a\u7684\u4fbf\u5229\u6027\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728529279589-a4fccbea-8d38-4f3a-b6c6-b8c0b7f1022b.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u32e21359\">\u4e00\u4e2a\u793a\u4f8b\uff0cPRM\u6a21\u578b\u5728\u68c0\u6d4b\u5230\u8ba1\u7b97\u9519\u8bef\u540e\uff0c\u663e\u8457\u964d\u4f4e\u4e86\u540e\u7eed\u6d41\u7a0b\u7684\u5206\u6570\u3002\u8be5\u6d4b\u8bd5\u7528\u4f8b\u53ea\u83b7\u5f97\u4e860.48\u5206\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728529481724-4e4adcf1-2a08-47d9-ab9c-e522033b413f.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"uf1c68e73\">\u5230\u8fd9\u91cc\uff0c\u95ee\u9898\u5c31\u89e3\u51b3\u4e86\uff0c\u6211\u4eec\u81ea\u52a8\u5316\u5730\u62ff\u5230\u4e86\u6b63\u786e\u7684\u7ed3\u679c\u548c\u6b63\u786e\u7684\u8fc7\u7a0b\u3002<\/p>\n\n\n\n<p id=\"u79338d00\">\u4f46\u662f\u4f3c\u4e4e\u8fd8\u6709\u66f4\u597d\u7684\u89e3\u51b3\u65b9\u6848&#8230;&#8230;<\/p>\n\n\n\n<p 
id=\"uf3688fd1\">\u4e0a\u9762\u63d0\u5230\uff0c\u867d\u7136Let&#8217;s Verify Step by Step[2]\u4f7f\u7528\u4e86\u66f4\u52a0\u5148\u8fdb\u7684PRM\u6765\u4f5c\u4e3areward model\uff0c\u4f46\u662fLLM Critics Help Catch LLM Bugs[8]\u8fd8\u6709\u4e00\u4e2a\u6539\u8fdb\u9879\uff0c\u90a3\u5c31\u662fForce Sampling Beam Search\uff08FSBS\uff09\uff0c\u901a\u8fc7sample \u7ea7\u522b\u7684beam search\u6765\u83b7\u5f97\u66f4\u52a0\u4f18\u79c0\u7684\u6574\u4f53\u8f93\u51fa\u3002\u90a3\u4e3a\u4ec0\u4e48\u4e0d\u5c06\u8fd9\u4e24\u4e2a\u7ed3\u5408\u8d77\u6765\u5462\uff1f<\/p>\n\n\n\n<p id=\"ufc11f429\">\u606d\u559c\u4f60\uff0c\u4f60\uff08\u51e0\u4e4e\uff09\u53d1\u73b0\u4e86o1\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" id=\"KC76G\"\/>\n\n\n\n<p id=\"ucecf4a6c\">\u5176\u5b9e\u7eb5\u89c2OpenAI\u8fd9\u51e0\u7bc7\u5de5\u4f5c\u7684\u76ee\u7684\u4ee5\u53ca\u9644\u5f55\u4e2d\u7684\u4e00\u4e9b\u5185\u5bb9\uff0c\u5176\u5b9e\u53ef\u4ee5\u53d1\u73b0OpenAI\u5f88\u591a\u5de5\u4f5c\u90fd\u662f\u4e3a\u4e86\u4eba\u5de5\u6253\u6807\u505a\u7684\u94fa\u57ab\uff0c\u4e0d\u7ba1\u662f\u81ea\u52a8\u6821\u9a8c\u7ed3\u679c\u7684[7]\u8fd8\u662f\u81ea\u52a8code review\u7684[8]\uff0c\u8fd9\u4e24\u7bc7\u5de5\u4f5c[7][8]\u505a\u4e86\u5927\u91cf\u7684\u5bf9\u6bd4human vs human+model\u7684\u6d4b\u8bc4\u6548\u7387\u4ee5\u53ca\u51c6\u786e\u6027\u3002\u8003\u8651\u5230ChatGPT\uff08\u5c24\u5176\u662f3.5\uff09\u4f7f\u7528\u7684\u662fRLHF\u800c\u4e0d\u662fRLAIF[9]\uff0c\u731c\u6d4bOpenAI\u4e00\u5f00\u59cb\u53ef\u80fd\u4e5f\u53ea\u662f\u60f3\u964d\u4f4e\u6807\u6ce8\u6210\u672c\uff0c\u968f\u7740\u6a21\u578b\u6027\u80fd\u7684\u4e0d\u65ad\u63d0\u5347\uff0c\u9010\u6e10\u7684\u53d1\u73b0\u53ef\u4ee5\u7528\u6765\u505aself-critic\u751a\u81f3\u4e8eRLAIF[9]\u5728\u4f5c\u4e3a\u5f3a\u5316\u5b66\u4e60\u7684reward\u4fe1\u53f7\u3002<\/p>\n\n\n\n<p 
id=\"ua7697212\">\u6574\u4f53\u601d\u8def\u8fd8\u662f\u6781\u5176\u8fde\u8d2f\u7684\uff0c\u56e0\u4e3axx\u6240\u4ee5\u6211\u60f3xx\u3002\u4e0d\u8fc7\u8fd9\u6761\u8def\u8d70\u4e0b\u53bb\u7684\u6311\u6218\u4f9d\u7136\u662f\u4e00\u4e9b\u4e3b\u89c2\u7684\u8bc4\u4ef7\uff0c\u76f8\u6bd4\u7406\u5de5\u79d1\u6b63\u786e\u6027\u5f88\u5f3a\uff0c\u6a21\u578b\u53ef\u4ee5\u5f88\u597d\u7684\u8fdb\u884ccritic\uff0c\u4f8b\u5982\u6587\u5b66\u827a\u672f\u521b\u4f5c\u8fd9\u7c7b\u4e3b\u89c2\u6027\u5f88\u5f3a\u7684\u4efb\u52a1\uff0c\u5982\u4f55\u8fdb\u884c\u81ea\u52a8\u5316\u6d4b\u8bc4\u4f9d\u7136\u662f\u672a\u6765\u4e00\u4e2a\u8f83\u5927\u7684\u6311\u6218\uff0c\u867d\u7136\u4f8b\u5982\u201cAI\u5baa\u6cd5\u201d\u63d0\u4f9b\u4e86\u4e00\u4e2a\u53ef\u80fd\u7684\u8def\u7ebf\uff0c\u4f46\u662f\u4eba\u7c7b\u6734\u7d20\u4ef7\u503c\u89c2\u7684\u76f8\u8fd1\u548c\u5bf9\u521b\u4f5c\u7684\u5f00\u653e\u6001\u5ea6\uff0c\u8fd9\u4f9d\u7136\u662f\u4e00\u4e2a\u5f88\u5927\u7684gap\u3002\u8fd9\u4e00\u70b9\u4e0a\u5373\u4f7f\u662fOpenAI\u76ee\u524d\u4e5f\u6ca1\u6709\u627e\u5230\u6bd4\u8f83\u5408\u9002\u7684reward model\u8bad\u7ec3\u65b9\u6cd5\u3002<\/p>\n\n\n\n<p id=\"ufc25afd8\">\u4ece\u4e0a\u6587\u4e2dOpenAI\u7684\u6280\u672f\u6f14\u8fdb\u8def\u7ebf\u53ef\u4ee5\u53d1\u73b0\uff0c\u5f3a\u5316\u5b66\u4e60\u5728\u5176\u4e2d\u5360\u636e\u4e86\u975e\u5e38\u91cd\u8981\u7684\u4f4d\u7f6e\uff0c\u4f46LLM\u548cRL\u7684\u7ed3\u5408\u5e76\u4e0d\u7b80\u5355\u3002<\/p>\n\n\n\n<ul>\n<li>\u5f3a\u5316\u5b66\u4e60\u4e2d\u7684Agent\u3001State\u3001Action\u7b49\u8981\u7d20\u5728NLP\u8bed\u5883\u4e2d\u5206\u522b\u6307\u4ec0\u4e48\uff1f<\/li>\n\n\n\n<li>\u8bed\u8a00\u6a21\u578b\u53c8\u662f\u5982\u4f55\u6839\u636eReward\u66f4\u65b0\u7684\uff1f<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"vzRrY\">2.1. 
RLHF<\/h2>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1728551092871-55b0602e-0ab4-42c4-b804-714c52b6bb55.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"uf25b038b\">RLHF\u662f\u4e4b\u524d\u6210\u529f\u7684LLM+RL\u7b97\u6cd5\uff0c\u4ed6\u7684Agent\u5c31\u662fLLM\uff0cstate\u5c31\u662f\u5f53\u524d\u7684\u8f93\u51fa\uff0caction\u5c31\u662ftoken\u3002<\/p>\n\n\n\n<p id=\"u9b43eedc\">\u5982\u4e0a\u56fe\uff0cRLHF \u7684\u5b8c\u6574\u5de5\u4f5c\u6d41\u7a0b\u53ef\u4ee5\u5206\u4e3a\u76d1\u7763\u5fae\u8c03\u3001\u5956\u52b1\u6a21\u578b\u8bad\u7ec3\u3001\u5f3a\u5316\u5b66\u4e60\u5fae\u8c03\u4e09\u4e2a\u9636\u6bb5\u3002<\/p>\n\n\n\n<p id=\"u9f04bf7a\">\u5728<a href=\"https:\/\/zhida.zhihu.com\/search?content_id=238709685&amp;content_type=Article&amp;match_order=1&amp;q=RLHF-PPO%E9%98%B6%E6%AE%B5&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer noopener\">RLHF-PPO\u9636\u6bb5<\/a>\uff0c\u4e00\u5171\u6709\u56db\u4e2a\u4e3b\u8981\u6a21\u578b\uff0c\u5206\u522b\u662f\uff1a<\/p>\n\n\n\n<ul>\n<li><strong>Actor Model\uff1a<\/strong>\u8fd9\u5c31\u662f\u6211\u4eec\u60f3\u8981\u8bad\u7ec3\u7684\u76ee\u6807\u8bed\u8a00\u6a21\u578b\uff0c\u4e00\u822c\u7528SFT\u9636\u6bb5\u4ea7\u51fa\u7684\u6a21\u578b\u6765\u5bf9\u5b83\u505a\u521d\u59cb\u5316\u3002<\/li>\n\n\n\n<li><strong>Critic Model\uff1a<\/strong>\u5b83\u7684\u4f5c\u7528\u662f\u9884\u4f30\u671f\u671b\u603b\u6536\u76ca Vt\uff0c\u4e00\u822c\u7528Reward Model\u521d\u59cb\u5316\u3002<\/li>\n\n\n\n<li><a href=\"https:\/\/zhida.zhihu.com\/search?content_id=238709685&amp;content_type=Article&amp;match_order=1&amp;q=Reward+Model&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer noopener\"><strong>Reward Model<\/strong><\/a><strong>\uff1a<\/strong>\u5b83\u7684\u4f5c\u7528\u662f\u8ba1\u7b97\u5373\u65f6\u6536\u76ca Rt\uff0c\u7528Rank 
Loss\u8bad\u7ec3\uff08\u901a\u8fc7\u6392\u5e8f\u5e8f\u5217\u5b66\u4f1a\u6253\u5206\uff09\u3002<\/li>\n\n\n\n<li><strong>Reference Model\uff1a<\/strong>\u5b83\u7684\u4f5c\u7528\u662f\u5728RLHF\u9636\u6bb5\u7ed9\u8bed\u8a00\u6a21\u578b\u589e\u52a0\u4e00\u4e9b\u201c\u7ea6\u675f\u201d\uff0c\u9632\u6b62\u8bed\u8a00\u6a21\u578b\u8bad\u6b6a\uff08\u671d\u4e0d\u53d7\u63a7\u5236\u7684\u65b9\u5411\u66f4\u65b0\uff0c\u6548\u679c\u53ef\u80fd\u8d8a\u6765\u8d8a\u5dee\uff09\uff0c\u4e00\u822c\u7528SFT\u9636\u6bb5\u4ea7\u51fa\u7684\u6a21\u578b\u6765\u5bf9\u5b83\u505a\u521d\u59cb\u5316\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728700186032-a44d62ba-f090-460c-aa19-dce64daee2f4.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u72bc3cda\">\u5176\u4e2d:<\/p>\n\n\n\n<ul>\n<li><strong>Actor\/Critic Model<\/strong>\u5728RLHF\u9636\u6bb5\u662f<strong>\u9700\u8981\u8bad\u7ec3<\/strong>\u7684\uff1b\u800c<strong>Reward\/Reference Model<\/strong>\u662f<strong>\u53c2\u6570\u51bb\u7ed3<\/strong>\u7684\u3002<\/li>\n\n\n\n<li>Critic\/Reward\/Reference Model\u5171\u540c\u7ec4\u6210\u4e86\u4e00\u4e2a\u201c\u5956\u52b1-loss\u201d\u8ba1\u7b97\u4f53\u7cfb\uff0c\u6211\u4eec\u7efc\u5408\u5b83\u4eec\u7684\u7ed3\u679c\u8ba1\u7b97loss\uff0c\u7528\u4e8e\u66f4\u65b0Actor\u548cCritic Model<\/li>\n<\/ul>\n\n\n\n<p id=\"ud9524ab3\"><a href=\"https:\/\/zhida.zhihu.com\/search?content_id=238709685&amp;content_type=Article&amp;match_order=1&amp;q=RLHF-PPO%E9%98%B6%E6%AE%B5&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer 
noopener\">RLHF-PPO<\/a>\u8bad\u7ec3\u8fc7\u7a0b\uff1a<\/p>\n\n\n\n<ul>\n<li>\u7b2c\u4e00\u6b65\uff0c\u6211\u4eec\u51c6\u5907\u4e00\u4e2abatch\u7684prompts\u3002<\/li>\n\n\n\n<li>\u7b2c\u4e8c\u6b65\uff0c\u6211\u4eec\u5c06\u8fd9\u4e2abatch\u7684prompts\u5582\u7ed9Actor\u6a21\u578b\uff0c\u8ba9\u5b83\u751f\u6210\u5bf9\u5e94\u7684responses\u3002<\/li>\n\n\n\n<li>\u7b2c\u4e09\u6b65\uff0c\u6211\u4eec\u628aprompt+responses\u5582\u7ed9\u6211\u4eec\u7684Critic\/Reward\/Reference\u6a21\u578b\uff0c\u8ba9\u5b83\u751f\u6210\u7528\u4e8e\u8ba1\u7b97actor\/critic loss\u7684\u6570\u636e\uff0c\u6309\u7167\u5f3a\u5316\u5b66\u4e60\u7684\u672f\u8bed\uff0c\u6211\u4eec\u79f0\u8fd9\u4e9b\u6570\u636e\u4e3a\u7ecf\u9a8c\uff08experiences\uff09\u3002critic loss\u6211\u4eec\u5c06\u5728\u540e\u6587\u505a\u8be6\u7ec6\u8bb2\u89e3\uff0c\u76ee\u524d\u6211\u4eec\u53ea\u628a\u76ee\u5149\u805a\u7126\u5230actor loss\u4e0a\u3002<\/li>\n\n\n\n<li>\u7b2c\u56db\u6b65\uff0c\u6211\u4eec\u6839\u636e\u8fd9\u4e9b\u7ecf\u9a8c\uff0c\u5b9e\u9645\u8ba1\u7b97\u51faactor\/critic loss\uff0c\u7136\u540e\u66f4\u65b0Actor\u548cCritic\u6a21\u578b\u3002Actor \u7684 loss \u51fd\u6570\u662f\u901a\u8fc7 discount reward \u548c importance ratio \u6765\u8ba1\u7b97\uff0cCritic \u7684 loss \u901a\u8fc7\u5f53\u524d\u7684\u9884\u6d4b\u503c\u548c\u771f\u5b9e\u503c\u52a0\u4e0a\u4e0b\u4e00\u65f6\u523b\u7684\u9884\u6d4b\u503c\u4e4b\u95f4\u7684\u5dee\u503c\u6765\u8ba1\u7b97\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"uOsRN\">2.2. 
<strong>\u8499\u7279\u5361\u6d1b\u6811\u641c\u7d22(<\/strong>MCTS)<\/h2>\n\n\n\n<p id=\"u82ae0abc\">RLHF\u662f\u5f88\u597d\u7684\u628aRL\u7528\u5728NLP\u4efb\u52a1\u4e2d\u7684\u65b9\u6cd5\uff0co1\u5728\u5b83\u7684\u57fa\u7840\u4e0a\u53c8\u505a\u4e86\u65b0\u7684\u5de5\u4f5c\uff0c\u4e3a\u4e86\u6253\u5f00\u6a21\u578b\u63a8\u7406\u7684\u89e3\u7a7a\u95f4\uff0co1\u5e94\u8be5\u4f7f\u7528\u4e86MCTS\u4e4b\u7c7b\u7684\u6811\u641c\u7d22\u7b97\u6cd5\uff0cMCTS\u662f\u4e00\u79cd\u7528\u4e8e\u89e3\u51b3\u5f3a\u5316\u5b66\u4e60\u95ee\u9898\u7684\u9ad8\u6548\u7684\u6811\u641c\u7d22\u7b97\u6cd5\uff0c\u53ef\u4ee5\u5e73\u8861\u5229\u7528\u548c\u63a2\u7d22\u3002MCTS\u4f7f\u7528\u6a21\u62df\u6765\u6784\u5efa\u4e00\u4e2a MDP \u6811\u3002\u8bc4\u4f30\u72b6\u6001\u5b58\u50a8\u5728\u4e00\u4e2a\u641c\u7d22\u6811\u4e2d\u3002\u8bc4\u4f30\u72b6\u6001\u96c6\u5408\u662f\u901a\u8fc7\u8fed\u4ee3\u4ee5\u4e0b\u56db\u4e2a\u6b65\u9aa4<strong>\u589e\u91cf\u5f0f<\/strong>\u5730\u6784\u5efa\u7684\uff1a<\/p>\n\n\n\n<ul>\n<li><strong>\u9009\u62e9\uff1a<\/strong>\u5728\u6811\u4e2d\u9009\u62e9\u4e00\u4e2a<strong>\u672a\u5b8c\u5168\u6269\u5c55<\/strong>\u7684\u5355\u7ed3\u70b9\u3002\u8fd9\u610f\u5473\u7740\u5b83\u81f3\u5c11\u6709\u4e00\u4e2a\u5b50\u7ed3\u70b9\u5c1a\u672a\u88ab\u63a2\u7d22\u3002<\/li>\n\n\n\n<li><strong>\u6269\u5c55\uff1a<\/strong>\u901a\u8fc7\u4ece\u8be5\u8282\u70b9\u5e94\u7528\u4e00\u4e2a\u53ef\u7528\u7684\u884c\u52a8\uff08\u7531 MDP \u5b9a\u4e49\uff09\u6765\u6269\u5c55\u8be5\u7ed3\u70b9\u3002<\/li>\n\n\n\n<li><strong>\u6a21\u62df\uff08Rollout\uff09\uff1a<\/strong>\u4ece\u4e00\u4e2a\u65b0\u7ed3\u70b9\u4e2d\uff0c\u5bf9 MDP 
\u8fdb\u884c\u4e00\u6b21Rollout\u3002\u5982\u679c\u641c\u7d22\u6811\u662f\u6709\u9650\u7684\uff0c\u53ef\u4ee5\u8fdb\u884c\u5b8c\u6574\u7684\u968f\u673a\u6a21\u62df\uff0c\u4f7f\u5176\u8fbe\u5230\u7ec8\u6b62\u72b6\u6001\u3002\u4f46\u5982\u679c\u641c\u7d22\u6811\u662f\u65e0\u9650\u7684\uff0c\u53ef\u4ee5\u53ea\u5728\u5176\u4e2d\u6267\u884c\u4e00\u6bb5\u65f6\u95f4\uff0c\u7136\u540e\u4f30\u8ba1\u7ed3\u679c\u3002<\/li>\n\n\n\n<li><strong>\u53cd\u5411\u4f20\u64ad\uff1a<\/strong>\u6700\u540e\uff0c\u5c06\u7ed3\u70b9\u7684\u4ef7\u503c<strong>\u53cd\u5411\u4f20\u64ad<\/strong>\u5230\u6839\u7ed3\u70b9\uff0c\u4f7f\u7528\u671f\u671b\u4ef7\u503c\u66f4\u65b0\u9014\u4e2d\u7ecf\u8fc7\u7684\u6bcf\u4e2a\u7956\u5148\u7ed3\u70b9\u7684\u4ef7\u503c\u3002<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"NlgF6\">2.2.1. AlphaGo Zero\u4e2d\u7684MCTS<\/h3>\n\n\n\n<p id=\"u47c51549\">AlphaZero\u7ed3\u5408\u4e86MCTS\u548cRL\uff0cMCTS\u662f\u4e3b\u4f53\uff0cRL\u8d77\u5230\u4e86\u52a0\u901f\u641c\u7d22\u901f\u5ea6\u7684\u4f5c\u7528\u3002\u5728Self Play\u8fc7\u7a0b\u4e2d\uff0c\u5bf9\u4e8e\u67d0\u4e2aAI\u68cb\u624b\uff0c\u5b83\u4f1a\u7528MCTS\u641c\u7d22\uff0c\u5bf9\u5f53\u524d\u72b6\u6001 S \u4e0b\u5404\u4e2a\u53ef\u80fd\u843d\u5b50\uff08Action\uff09\u90fd\u53bb\u641c\u4e00\u4e0b\uff0c\u6bcf\u4e2a\u4f4d\u7f6e\u7ecf\u8fc7\u641c\u7d22\u4e4b\u540e\uff0c\u80fd\u83b7\u5f97\u6bcf\u4e2a\u843d\u5b50\u4f4d\u7f6e\u8d62\u68cb\u7684<a href=\"https:\/\/zhida.zhihu.com\/search?content_id=248563321&amp;content_type=Article&amp;match_order=1&amp;q=%E6%A6%82%E7%8E%87%E5%88%86%E5%B8%83&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer noopener\">\u6982\u7387\u5206\u5e03<\/a> \u03c0 \uff0c\u4ece\u4e2d\u9009\u62e9\u6982\u7387\u6700\u5927\u7684\u4f4d\u7f6e\u6765\u843d\u5b50\uff0c\u4e4b\u540e\u53e6\u4e00\u4e2aAI\u68cb\u624b\u4e5f\u91c7\u7528\u7c7b\u4f3c\u7684\u601d\u8def\u53bb\u843d\u5b50\u2026\u2026\u8fd9\u4e48\u4e00\u6765\u4e00\u56de\u76f4\u5230\u5206\u51fa\u80dc\u8d1f\uff08 z 
\u6307\u51fa\u8c01\u662f\u80dc\u8005,Reward\u4fe1\u53f7\uff09\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1728551468686-dc42679c-aa6d-4931-978d-16c9c8729564.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"ucd491bf2\">\u3010a\u56fe\u3011\u8868\u793a\u81ea\u5bf9\u5f08\u8fc7\u7a0b <img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/aade72bc495f8da3549fd4f0c9062216.svg\">\u3002\u5728\u6bcf\u4e00\u4e2a\u4f4d\u7f6e\uff0c\u4f7f\u7528\u6700\u65b0\u7684\u795e\u7ecf\u7f51\u7edc\u6267\u884c\u4e00\u6b21MCTS\u641c\u7d22\u3002\u6839\u636e\u641c\u7d22\u5f97\u51fa\u7684\u6982\u7387\u8fdb\u884c\u843d\u5b50\u3002\u7ec8\u5c40\u65f6\u6839\u636e\u56f4\u68cb\u89c4\u5219\u8ba1\u7b97\u80dc\u8005\uff0c<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/75b0254963d0e762842382beb51f18fb.svg\">\u662f\u6bcf\u4e00\u6b65\u65f6\u6267\u884cMCTS\u641c\u7d22\u5f97\u51fa\u7684\u7ed3\u679c\uff08\u67f1\u72b6\u56fe\u8868\u793a\u6982\u7387\u7684\u9ad8\u4f4e\uff09<\/p>\n\n\n\n<p id=\"ub2e943b4\">\u3010b\u56fe\u3011\u8868\u793a\u66f4\u65b0\u795e\u7ecf\u7f51\u7edc\u53c2\u6570\u8fc7\u7a0b\u3002\u4f7f\u7528\u539f\u59cb\u843d\u5b50\u72b6\u6001<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/24c631e57687c69f2ccf6aa7b8f3f430.svg\">\u4f5c\u4e3a\u8f93\u5165\uff0c\u5f97\u5230\u6b64\u68cb\u76d8\u72b6\u6001<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/24c631e57687c69f2ccf6aa7b8f3f430.svg\">\u4e0b\u4e0b\u4e00\u6b65\u6240\u6709\u53ef\u80fd\u843d\u5b50\u4f4d\u7f6e\u7684\u6982\u7387\u5206\u5e03<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/3dc94096f0c327b693e084140d512d2f.svg\">\u548c\u5f53\u524d\u72b6\u6001<img decoding=\"async\" 
src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/24c631e57687c69f2ccf6aa7b8f3f430.svg\">\u4e0b\u9009\u624b\u7684\u8d62\u68cb\u8bc4\u4f30\u503c<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/2f78b28373820ed6ae1ed827af3b5716.svg\">\u4ee5\u6700\u5927\u5316<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/3dc94096f0c327b693e084140d512d2f.svg\">\u4e0e<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/22350163c33aec17bae12ff3ca1f72f1.svg\">\u76f8\u4f3c\u5ea6\u548c\u6700\u5c0f\u5316\u9884\u6d4b\u7684\u80dc\u8005 <img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/2f78b28373820ed6ae1ed827af3b5716.svg\">\u548c\u5c40\u7ec8\u80dc\u8005\u7684\u8bef\u5dee\u6765\u66f4\u65b0\u795e\u7ecf\u7f51\u7edc\u53c2\u6570<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/__latex\/125a97181496f444fc00b132f2b869f1.svg\">, \u4e0b\u4e00\u8f6e\u8fed\u4ee3\u4e2d\u4f7f\u7528\u65b0\u795e\u7ecf\u7f51\u7edc\u8fdb\u884c\u81ea\u6211\u5bf9\u5f08\u3002<\/p>\n\n\n\n<p id=\"ue76fe9c3\"><strong>\u6811\u641c\u7d22\u6d41\u7a0b<\/strong><\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1728551491147-ae9f3c4b-717d-4115-955f-a96ad1c5f884.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"uf2f38f0a\">\u3010a\u56fe\u3011\u8868\u793a\u6a21\u62df\u8fc7\u7a0b\u4e2d\u904d\u5386\u65f6\u9009 Q+U \u66f4\u5927\u7684\u4f5c\u4e3a\u843d\u5b50\u70b9<\/p>\n\n\n\n<p id=\"u84106abf\">\u3010b\u56fe\u3011\u53f6\u5b50\u8282\u70b9\u7684\u6269\u5c55\u548c\u8bc4\u4f30\u3002\u4f7f\u7528\u795e\u7ecf\u7f51\u7edc\u5bf9\u5f53\u524d\u72b6\u6001\u8fdb\u884c\u8bc4\u4f30\u3002<\/p>\n\n\n\n<p id=\"uee1f0fb9\">\u3010c\u56fe\u3011\u66f4\u65b0\u884c\u52a8\u4ef7\u503c Q \u7b49\u4e8e\u6b64\u65f6\u6839\u72b6\u6001 s 
\u6240\u6709\u5b50\u6811\u8bc4\u4f30\u503c V \u7684\u5e73\u5747\u503c<\/p>\n\n\n\n<p id=\"uaf442103\">\u3010d\u56fe\u3011\u5f53MCTS\u641c\u7d22\u5b8c\u6210\u540e\uff0c\u8fd4\u56de\u8fd9\u4e2a\u72b6\u6001 s \u4e0b\u6bcf\u4e00\u4e2a\u4f4d\u7f6e\u7684\u843d\u5b50\u6982\u7387 \u03c0<\/p>\n\n\n\n<p id=\"ufa3de48a\"><strong>\u603b\u7ed3\uff1aMCTS\u5c31\u662f\u4e00\u4e2a\u9ad8\u6548\u7684\u5e73\u8861\u5229\u7528\u548c\u63a2\u7d22\uff08\u526a\u679d\uff09\u7684\u6811\u641c\u7d22\u7b97\u6cd5<\/strong><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"Oi90X\">2.2.2. NLP\u4efb\u52a1\u4e2d\u7684MCTS<\/h3>\n\n\n\n<p id=\"u3d108698\">\u53c2\u8003AlphaGo Zero\u7684\u505a\u6cd5\uff0c\u5047\u8bbeAction\u662f\u6bcf\u4e00\u4e2a\u53e5\u5b50\uff08Action\u53ef\u4ee5\u6709\u4e0d\u540c\u7684\u5c42\u7ea7\uff0c\u540e\u9762\u4f1a\u8ba8\u8bba\uff0c\u8fd9\u91cc\u53ea\u8bb2NLP\u4efb\u52a1\u4e2dMCTS\u7684\u505a\u6cd5\uff09\u3002\u5728\u641c\u7d22\u65f6\u4f1a\u7528\u7b56\u7565\u7f51\u7edc P \u548c\u4ef7\u503c\u7f51\u7edc V \u6765\u5feb\u901f\u5bfb\u627e\u6700\u4f18\u641c\u7d22\u8def\u5f84\uff0c\u9009\u62e9\u6982\u7387\u6700\u5927\u7684Action\u4f5c\u4e3a\u5f53\u524d\u72b6\u6001\u4e0b\u7684\u884c\u4e3a\uff0c\u7531\u6a21\u578b\u751f\u6210\u9488\u5bf9\u8fd9\u4e2a\u884c\u4e3a\u4e0bresponse\u3002\u5c06response\u5e76\u5165\u7528\u6237\u95ee\u9898\uff0c\u5f62\u6210\u65b0\u7684State\uff0c\u4f9d\u6b21\u5f80\u540e\u8d70\uff0c\u76f4\u5230\u4ea7\u751f\u95ee\u9898\u7684\u7b54\u6848\u3002<img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1728836398359-15e66ae6-38b5-47a7-a3a4-33eedd223a90.png\" width=\"1542.7272392895604\"><\/p>\n\n\n\n<p id=\"u06e743ee\"><strong>\u5177\u4f53\u641c\u7d22\u8fc7\u7a0b\uff1a<\/strong><\/p>\n\n\n\n<ol>\n<li>\u4ece\u72b6\u6001 S \u51fa\u53d1\uff0c\u641c\u7d22\u67d0\u4e2a\u201c\u53e5\u5b50\u201d\u6307\u5411\u6b63\u786e\u7b54\u6848\u7684\u6982\u7387\u65f6\uff0c\u4ee5 max(Q+U) 
\u7684\u65b9\u5f0f\u5bfb\u627e\u6700\u4f18\u4e0b\u4e00\u72b6\u6001 S\u2032 \uff0c\u800c Q \u51fd\u6570\u4e0e\u4ef7\u503c\u7f51\u7edc V(S) \u6b63\u76f8\u5173\uff0c U \u51fd\u6570\u4e0e\u7b56\u7565\u7f51\u7edc P(S,A) \u6b63\u76f8\u5173\uff0c\u6240\u4ee5 max(Q+U) \u7684\u542b\u4e49\u662f\u901a\u8fc7\u4ef7\u503c\u7f51\u7edc\u548c\u7b56\u7565\u7f51\u7edc\u7684\u6307\u5f15\uff0c\u6765\u5bfb\u627e\u9ad8\u8d28\u91cf\u7684\u641c\u7d22\u8def\u5f84<\/li>\n\n\n\n<li>\u5f53\u641c\u7d22\u5230<a href=\"https:\/\/zhida.zhihu.com\/search?content_id=248563321&amp;content_type=Article&amp;match_order=1&amp;q=%E5%8F%B6%E7%BB%93%E7%82%B9&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer noopener\">\u53f6\u7ed3\u70b9<\/a>\u7684\u65f6\u5019\uff0c\u4f1a\u8fdb\u884c\u8282\u70b9\u6269\u5c55\uff0c\u5e76\u7528\u7b56\u7565\u7f51\u7edc\u548c\u4ef7\u503c\u7f51\u7edc\u4f30\u7b97\u521d\u59cb\u5316<a href=\"https:\/\/zhida.zhihu.com\/search?content_id=248563321&amp;content_type=Article&amp;match_order=1&amp;q=%E7%9B%B8%E5%85%B3%E6%90%9C%E7%B4%A2&amp;zhida_source=entity\" target=\"_blank\" rel=\"noreferrer noopener\">\u76f8\u5173\u641c\u7d22<\/a>\u53c2\u6570\uff0c\u4e4b\u540e\u7531\u5e95\u5411\u4e0a\u66f4\u65b0\u6700\u4f18\u8def\u5f84\u4e0a\u6240\u6709\u72b6\u6001\u5bf9\u5e94\u7684 Q \u51fd\u6570\u3002<\/li>\n\n\n\n<li>o1\u641c\u7d22\u65f6\u4e0e\u4e0b\u68cb\u4e0d\u540c\u7684\u5730\u65b9\u5728\u4e8e\uff1a\u5982\u679c\u8981\u5f80\u4e0b\u4e00\u72b6\u6001\u8f6c\u79fb\uff0c\u8fd8\u9700\u8981\u6839\u636e\u5f53\u524d\u9009\u5230\u7684\u884c\u4e3a\uff0c\u4ea7\u751f\u5bf9\u5e94\u7684Hidden CoT tokens\uff0c\u8fd9\u4e2a\u6b65\u9aa4\u53ef\u7531Best-of-N Sampling\u7b56\u7565\u6765\u5b8c\u6210\u3002<\/li>\n<\/ol>\n\n\n\n<p id=\"ub3e1c92c\"><strong>\u53c2\u6570\u66f4\u65b0\u8fc7\u7a0b\uff1a<\/strong><\/p>\n\n\n\n<ol>\n<li>\u5f53\u6bcf\u4e2a\u5019\u9009\u7684\u201c\u53e5\u5b50\u201d\u7ecf\u8fc7\u4e00\u8f6e\u641c\u7d22\u540e\u4f1a\u5f97\u5230\u6240\u6709\u884c\u4e3a\u7684\u5206\u5e03\u6982\u7387 \u03c0
\uff0c\u7528\u4e8e\u66f4\u65b0\u7b56\u7565\u7f51\u7edc P(S,A) \u3002<\/li>\n\n\n\n<li>\u5bf9\u4e8e\u641c\u7d22\u8fc7\u7a0b\u6bcf\u4e2a\u88ab\u9009\u4e2d\u201c\u53e5\u5b50\u201d\u901a\u8fc7Best-of-N Sampling\u5f97\u5230\u7684\u5bf9\u5e94Hidden CoT tokens\u5e8f\u5217\uff0c\u53ef\u4ee5\u62ff\u5230PRM\u8d4b\u4e88\u8fd9\u4e2atokens\u5e8f\u5217\u5bf9\u5e94\u7684Process Reward\u5206\u6570\uff0c\u7528\u6765\u66f4\u65b0\u4ef7\u503c\u7f51\u7edc V(S) \u3002<\/li>\n<\/ol>\n\n\n\n<h1 class=\"wp-block-heading\" id=\"B0iNG\">3. o1\uff1a\u6280\u672f\u8def\u7ebf\u731c\u60f3\u4e0e\u590d\u73b0<\/h1>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"beqEE\">3.1. o1\u7684\u5b9e\u73b0\u731c\u60f3<\/h2>\n\n\n\n<p id=\"ueb384c83\">\u672c\u8282\u4e3b\u8981\u662f\u5bf9o1\u7684\u5b9e\u73b0\u505a\u4e00\u4e9b\u731c\u6d4b\uff0c\u731c\u6d4b\u7684\u4fe1\u606f\u6765\u6e90\u4e3b\u8981\u662f\u4e00\u4e9bOpenAI\u7814\u7a76\u5458\u4ee5\u53ca\u4e00\u4e9b\u5206\u6790\u3002<\/p>\n\n\n\n<p id=\"u9e48f0b3\">\u6b63\u5982\u524d\u6587\u63d0\u5230\u7684\uff0c\u5c0f\u6a21\u578b+\u65e0\u9650\u957f\u7684CoT\u53ef\u4ee5\u89e3\u51b3\u4e16\u754c\u4e0a\u4efb\u4f55\u95ee\u9898[11][12]\u3002\u56e0\u6b64\uff0co1\u4e00\u5b9a\u662f\u4e00\u4e2a\u901a\u8fc7\u4f7f\u7528\u8d85\u957fCoT\u6765\u5f3a\u5316\u6a21\u578b\u8f93\u51fa\u6b63\u786e\u7b54\u6848\u80fd\u529b\u7684\u6a21\u578b\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728702331291-25e2f56f-a2d5-4b61-ab28-39100441f139.png?x-oss-process=image%2Fformat%2Cwebp\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u9ca36946\">\u90a3\u63a5\u4e0b\u6765\u5c06\u4ece\u4ee5\u4e0b\u51e0\u4e2a\u90e8\u5206\u6765\u63cf\u8ff0\u3002<\/p>\n\n\n\n<p
id=\"u0bf32a20\">\u9996\u5148\uff0co1\u662f\u4e00\u4e2a\u6a21\u578b\u5e76\u975e\u4e00\u4e2a\u7cfb\u7edf\uff0c\u56e0\u6b64o1\u8fd9\u7c7b\u6a21\u578b\u9700\u8981\u5f88\u5f3a\u7684\u63a8\u7406\u80fd\u529b\uff0c\u5e76\u4e14\u9700\u8981\u8f83\u5f3a\u7684\u5bf9\u8bdd\u80fd\u529b\u6765\u505aself-critic\u8fd9\u7c7b\u52a8\u4f5c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728711127711-6d243ab3-7827-4a6f-9fef-175b239e1cb9.png?x-oss-process=image%2Fformat%2Cwebp\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u54358946\">\u8003\u8651\u5230o1\u5728System card\u4e2d\u4e00\u76f4\u63d0\u5230\u7684o1-mini\u7684\u4e16\u754c\u77e5\u8bc6\u5f88\u5dee[20]\uff0c\u731c\u6d4bo1\u662f\u4e00\u4e2afrom scratch\u8bad\u7ec3\u7684\u5c0f\u6a21\u578b\u3002\u56e0\u6b64\u5c06\u6574\u4f53\u7684\u6280\u672f\u6808\u62c6\u89e3\u4e3a\uff1a<\/p>\n\n\n\n<ul>\n<li>pre-training\u9636\u6bb5\uff1a<\/li>\n\n\n\n<li>\u5f3a\u5316\u63a8\u7406\u80fd\u529b<\/li>\n\n\n\n<li>post-training\u9636\u6bb5\uff1a<\/li>\n\n\n\n<li>sft:<\/li>\n\n\n\n<li>\u7ee7\u7eed\u5f3a\u5316\u63a8\u7406\u80fd\u529b<\/li>\n\n\n\n<li>RL\uff1a<\/li>\n<\/ul>\n\n\n\n<ul>\n<li>\u63a2\u7d22\u5982\u4f55\u6269\u5927\u6a21\u578b\u7684\u89e3\u7a7a\u95f4<\/li>\n\n\n\n<li>\u8bad\u7ec3\u4e00\u4e2a\u66f4\u4f18\u79c0\u7684reward model<\/li>\n\n\n\n<li>inference\u9636\u6bb5\uff1a<\/li>\n\n\n\n<li>Best of N<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1729492083121-b10321ce-2236-404b-82ad-08c76d3ebcb4.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"KMKIR\">3.1.1. 
Pre-Train\u9636\u6bb5<\/h3>\n\n\n\n<p id=\"u2898b878\">\u8fd9\u4e00\u90e8\u5206\u56e0\u4e3a\u540e\u9762RL\u4ecd\u7136\u6709\u8f83\u5927\u7684\u81ea\u7531\u5ea6\uff0c\u56e0\u6b64\u57fa\u6a21\u76f8\u5bf9\u91cd\u8981\u6027\u5e94\u8be5\u6bd4\u8f83\u4f4e\u3002\u56e0\u4e3a\u540e\u7eed\u4f7f\u7528\u4e86\u6811\u641c\u7d22\uff0c\u751a\u81f3\u53ef\u4ee5\u662f\u4e0d\u662f\u90a3\u4e48\u5f3a\u7684\u6a21\u578b\u2014\u2014\u56e0\u4e3a\u89e3\u7a7a\u95f4\u8db3\u591f\u5927\u603b\u6709\u4e00\u6761\u8f93\u51fa\u8def\u5f84\u53ef\u4ee5\u8fbe\u5230\u6b63\u786e\u89e3\u3002\u4e0d\u8fc7\u8f83\u597d\u7684\u9884\u8bad\u7ec3\u6a21\u578b\u53ef\u4ee5\u6709\u6548\u964d\u4f4e\u540e\u7eedRL\u7684\u8ba1\u7b97\u6210\u672c\u3002<\/p>\n\n\n\n<p id=\"udbcbc7f0\">\u4f46\u662f\u9884\u8bad\u7ec3\u9636\u6bb5\u5e94\u8be5\u9700\u8981\u63d0\u5347\u6a21\u578b\u7684\u63a8\u7406\u80fd\u529b\uff0c\u76ee\u524d\u5df2\u77e5\u7684\u4e00\u4e9b\u7b56\u7565\u662f\u63d0\u9ad8\u4f8b\u5982Code\u3001Math\u3001\u8bba\u6587\u7b49\u6570\u636e\u5360\u6bd4\uff0c\u51cf\u5c11\u4f8b\u5982cc\u7b49\u6570\u636e\uff0c\u63d0\u5347\u6a21\u578b\u7684\u8bad\u7ec3ROI\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"S3Xof\">3.1.2. SFT\u9636\u6bb5<\/h3>\n\n\n\n<p id=\"ue02266cf\">\u8fd9\u4e00\u9636\u6bb5\u548cpre-training\u9636\u6bb5\u7684\u76ee\u6807\u4e00\u81f4\uff0c\u5c31\u662f\u5f3a\u5316CoT\u3001Chat\u7b49\u6570\u636e\u5360\u6bd4\uff0c\u589e\u5f3a\u6a21\u578b\u957fCoT\u8f93\u51fa\u80fd\u529b\u548c\u5bf9\u8bdd\u80fd\u529b\u3002<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"nWamj\">3.1.3. 
\u5f3a\u5316\u5b66\u4e60\u9636\u6bb5<\/h3>\n\n\n\n<p id=\"uc8dda0a6\">OpenAI\u7684\u5f88\u591a\u5de5\u4f5c\u90fd\u505a\u5728\u5f3a\u5316\u4e0a\uff0cOpenAI\u7684\u5f3a\u9879\u4e00\u76f4\u90fd\u662fRL\uff0cOpenAI\u81ea\u5df1\u5bf9\u5916\u8bf4RL\u76f8\u6bd4Pretrain\u9636\u6bb5\u7684\u63d0\u5347\uff0c\u7b97\u529b\u53ef\u4ee5\u8282\u7ea630\u500d\u3002<\/p>\n\n\n\n<p id=\"u47e992fb\">o1\u7684\u8bbf\u8c08\u53ef\u4ee5\u603b\u7ed3\u51fa\u4ee5\u4e0b\u51e0\u70b9\u6bd4\u8f83\u6709\u6548\u7684\uff1a<\/p>\n\n\n\n<ul>\n<li>RL\u8f93\u51fa\u7684CoT\u6bd4\u4eba\u7684\u66f4\u597d\uff08\u6bd4\u5982\u4eba\u7c7b\u603b\u662f\u66f4\u559c\u6b22\u597d\u7406\u89e3\u800c\u4e0d\u662f\u903b\u8f91\u4e25\u8c28\u7684\u5185\u5bb9\uff09\u3002<\/li>\n\n\n\n<li>\u867d\u7136\u6709\u4e00\u4e9b\u6311\u6218\uff08\u6bd4\u5982\u5982\u4f55\u8bbe\u8ba1reward\u7b49\uff09\uff0c\u4f46\u662fRL\u662f\u53ef\u4ee5\u4e00\u76f4\u8d70\u4e0b\u53bb\u7684\u4e00\u6761\u8def\u7ebf\u3002<\/li>\n\n\n\n<li>CoT + self-critic \u53ef\u4ee5\u89e3\u51b3\u4e16\u754c\u4e0a\u4efb\u4f55\u7684\u95ee\u9898\u3002<\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728702747368-0722a318-6fa6-4c55-81cb-8e469aabec7f.png?x-oss-process=image%2Fformat%2Cwebp%2Fresize%2Cw_1105%2Climit_0?x-oss-process=image%2Fcrop%2Cx_0%2Cy_0%2Cw_1105%2Ch_800\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u67e3be94\">\u5177\u4f53\u6765\u8bf4\uff0co1\u76f8\u6bd4\u5176\u4ed6\u6a21\u578b\u6709\u5176\u4ed6\u51e0\u4e2a\u6311\u6218\uff1a<\/p>\n\n\n\n<ul>\n<li>\u5982\u4f55\u6269\u5927\u6a21\u578b\u7684\u89e3\u7a7a\u95f4\uff1f\u6362\u53e5\u8bdd\u8bf4\u5982\u4f55\u8ba9\u6a21\u578b\u8f93\u51fa\u66f4\u957f\u3001\u66f4\u4f18\u79c0\u7684CoT\uff1f<\/li>\n\n\n\n<li>\u5982\u4f55\u786e\u5b9aCoT\u8d28\u91cf\uff1f<\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"YDvHT\">3.1.3.1. 
\u5982\u4f55\u6269\u5927\u89e3\u7a7a\u95f4<\/h4>\n\n\n\n<p id=\"ue7eda719\">\u6a21\u578b\u9700\u8981\u6269\u5927\u5e38\u89c4\u7684\u89e3\u7801\u7a7a\u95f4\uff0c\u4ee5\u6b64\u6765\u83b7\u5f97\u66f4\u52a0\u4f18\u79c0\u7684\u957fCoT\u548c\u7ed3\u679c\u3002<\/p>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"GSkd2\">3.1.3.1.1. MCTS<\/h5>\n\n\n\n<p id=\"u86afea95\">MCTS\u662f\u76ee\u524d\u8f83\u591a\u5de5\u4f5c\u4f7f\u7528\u7684\u4e00\u4e2a\u6269\u5927\u89e3\u7a7a\u95f4\u7684\u7b56\u7565[13,14,16,24-29]\uff0c\u4e5f\u662fAlphaGo Zero\u4f7f\u7528\u7684\u65b9\u6848\u3002<\/p>\n\n\n\n<p id=\"u7d7b8867\">\u5728\u4e0b\u6587\u4e2d\uff0c\u9996\u5148\u5b9a\u4e49Action\u4e3a\u6bcf\u4e00\u6b21\u641c\u7d22\u4e2d\u4e0b\u4e00\u4e2a\u52a8\u4f5c\uff0c\u6bd4\u5982\u5728\u56f4\u68cb\u4e2d\u4e3a\u843d\u5b50\u7684\u5730\u70b9\u3002<\/p>\n\n\n\n<h6 class=\"wp-block-heading\" id=\"CYe6U\">3.1.3.1.1.1. <strong>\u5728\u89e3\u7a7a\u95f4\u4e2d\u641c\u7d22CoT\u8def\u5f84<\/strong><\/h6>\n\n\n\n<p id=\"uc4ebc442\">\u4eff\u7167AlphaZero\uff0c\u6784\u9020MCTS\u6811\uff0c\u5bf9\u4e8e\u6bcf\u4e2aAction\u8fdb\u884c\u641c\u7d22\u3002<\/p>\n\n\n\n<p id=\"u3e815162\">\u7ecf\u5386\u8fc7\u8fd9\u4e9b\u8282\u70b9\u7684\u641c\u7d22\u8def\u5f84\u66f4\u5bb9\u6613\u8fbe\u5230\u6b63\u786e\u7684\u7ed3\u679c\uff0c\u56e0\u6b64\u5728\u968f\u540e\u7684\u8bad\u7ec3\u4e2d\u88ab\u5f3a\u5316\u3002\u63a8\u7406\u8fc7\u7a0b\u4e2d\u4e5f\u5c31\u81ea\u7136\u800c\u7136\u5730\u4f1a\u8f93\u51fa\u8fd9\u4e9b\u4fe1\u606f\u83b7\u5f97\u66f4\u4f18\u79c0\u7684\u7ed3\u679c\u3002<\/p>\n\n\n\n<h6 class=\"wp-block-heading\" id=\"R4Ziq\">3.1.3.1.1.2. o1\u7684Action<\/h6>\n\n\n\n<p id=\"u875493d6\"><strong>1.
Thought\u4f5c\u4e3aAction<\/strong><\/p>\n\n\n\n<p id=\"ub1532cb8\">\u8fd9\u662f\u6700\u81ea\u7136\u7684\u60f3\u6cd5\uff0co1\u7684Hidden CoT\u4ea7\u751f\u8fc7\u7a0b\uff0c\u672c\u8d28\u4e0a\u662f\u5728\u8ba9\u673a\u5668\u6a21\u4eff\u4eba\u5728\u89e3\u51b3\u590d\u6742\u95ee\u9898\u4ea7\u751f\u7684\u601d\u8003\u8fc7\u7a0b\uff0c\u800c\u4eba\u5728\u601d\u8003\u590d\u6742\u95ee\u9898\u65f6\uff0c\u6709\u6bd4\u8f83\u56fa\u5b9a\u4e14\u6570\u91cf\u5e76\u4e0d\u592a\u591a\u7684\u201c\u601d\u8003\u6a21\u5f0f\u201d\u6216\u8005\u53ef\u4ee5\u53eb\u201c\u601d\u8003\u56e0\u5b50\u201d\u3002<\/p>\n\n\n\n<p id=\"u6f28d8df\">\u6bd4\u5982\u62ff\u5230\u4e00\u4e2a\u590d\u6742\u95ee\u9898\uff0c\u6211\u4eec\u4e00\u822c\u4f1a\u9996\u5148\u660e\u786e\u8fd9\u4e2a\u95ee\u9898\u7684\u76ee\u6807\u662f\u4ec0\u4e48\uff0c\u7136\u540e\u628a\u590d\u6742\u95ee\u9898\u62c6\u89e3\u6210\u51e0\u4e2a\u73af\u8282\u6216\u8005\u6b65\u9aa4\uff0c\u4e3a\u4e86\u5f97\u5230\u67d0\u4e00\u4e2a\u5177\u4f53\u6b65\u9aa4\u7684\u89e3\u6cd5\uff0c\u53ef\u80fd\u4f1a\u63d0\u51fa\u4e00\u4e2a\u5047\u8bbe\uff0c\u7136\u540e\u9a8c\u8bc1\u8fd9\u4e2a\u5047\u8bbe\u662f\u5426\u6210\u7acb\uff0c\u5982\u679c\u4e0d\u6210\u7acb\uff0c\u90a3\u4e48\u7ee7\u7eed\u63d0\u51fa\u65b0\u7684\u5047\u8bbe\uff0c\u76f4\u5230\u89e3\u51b3\u8fd9\u4e2a\u5b50\u95ee\u9898\u2026..\u6211\u4eec\u4e5f\u53ef\u80fd\u5728\u8fc7\u7a0b\u4e2d\u4f1a\u8fdb\u884c\u9a8c\u7b97\u5e76\u53d1\u73b0\u67d0\u4e9b\u4e2d\u95f4\u73af\u8282\u51fa\u73b0\u9519\u8bef\uff0c\u5e76\u628a\u9519\u8bef\u4fee\u6b63\u8fc7\u6765\u3002<\/p>\n\n\n\n<p 
id=\"uefc9acfd\">\u6bd4\u5982\uff1a\u201c\u62c6\u89e3\u95ee\u9898\u201d\u3001\u201c\u590d\u8ff0\u76ee\u6807\u201d\u3001\u201c\u68c0\u67e5\u7ed3\u679c\u201d\u3001\u201c\u4fee\u6b63\u9519\u8bef\u201d\u3001\u201c\u63d0\u51fa\u5047\u8bbe\u201d\u7b49\u7b49\u3002\u800c\u9488\u5bf9\u6bcf\u4e2a\u5177\u4f53\u7684\u201c\u601d\u8003\u56e0\u5b50\u201d\uff0c\u53ef\u4ee5\u4ea7\u751f\u7b26\u5408\u5bf9\u5e94\u5206\u5e03\u6982\u7387\u7684Token\u7247\u6bb5\uff0c\u4f8b\u5982self-critic\u53ef\u80fd\u5c31\u662f\u68c0\u67e5\u7ed3\u679c\u8fd9\u4e2aaction\u5bf9\u5e94\u7684\u6a21\u578b\u8f93\u51fa\uff08\u5176\u5b9e\u4e0a\u6587\u7684criticGPT\u4e3b\u52a8\u62fc\u63a5&#96;&#96;&#96;\uff0c\u4e5f\u662f\u4eba\u5de5\u9009\u62e9action\u7684\u4e00\u79cd\uff09\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556585\/1728836736435-598d8b32-aeef-4ee4-832f-bc4da9e3dc09.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"ud104e483\">\u8fd9\u79cd\u5c42\u7ea7\u7684CoT\u7ed3\u6784\uff0c\u80fd\u4f53\u73b0\u51faRL\u548cLLM\u7684\u4f18\u52bf\u7ed3\u5408\uff0c\u79bb\u6563\u884c\u4e3a\u7a7a\u95f4\u6bd4\u5982\u4f30\u7b97\u7ed9\u5b9a\u72b6\u6001S\u91c7\u53d6\u4f55\u79cd\u884c\u4e3a\uff0c\u5373\u51fd\u6570Q(S,A)\u7684\u4f30\u7b97\uff0c\u8fd9\u662fRL\u64c5\u957f\u505a\u7684\u4e8b\u60c5\uff0c\u800c\u601d\u8003\u56e0\u5b50\u6807\u7b7e\u4e2d\u7684Token\u751f\u6210\u5219\u662fLLM\u64c5\u957f\u7684\u4e8b\u60c5\uff0cLLM\u53ef\u4ee5\u6839\u636e\u5bf9\u5e94\u201c\u601d\u8003\u56e0\u5b50\u201d\u7684\u7c7b\u578b\uff0c\u5b66\u4e60\u8c03\u6574\u56e0\u5b50\u6807\u7b7e\u5185\u90e8Token\u7684\u751f\u6210\u6982\u7387\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" id=\"s7Sa5\"\/>\n\n\n\n<p id=\"u15dad876\"><strong>2.
Token\u4f5c\u4e3aAction<\/strong><\/p>\n\n\n\n<p id=\"ufb39bb1b\">\u867d\u7136thought\u662f\u4e00\u4e2a\u66f4\u7b80\u5355\u66f4\u76f4\u89c2\u7684\u65b9\u6848\uff0c\u4f46\u662fOpenAI\u53ef\u80fd\u7528\u7684\u5e76\u4e0d\u662f\u8fd9\u7c7b\u65b9\u6848\uff0c\u7406\u7531\u5982\u4e0b\uff1a<\/p>\n\n\n\n<ul>\n<li>Hyung Won Chung\u6700\u8fd1\u7684\u6f14\u8bb2\u8bf4\uff1a\u4e0d\u5e94\u8be5\u6559\u673a\u5668\u5982\u4f55\u53bb\u601d\u8003\u800c\u662f\u53bb\u6fc0\u53d1\u4ed6\u4eec\u5b66\u4e60\u5982\u4f55\u53bb\u601d\u8003\u3002<\/li>\n\n\n\n<li>o1\u5f00\u53d1\u4eba\u5458\u91c7\u8bbf\u89c6\u9891\u4e2d\u63d0\u5230\u7684\uff0co1\u5f00\u53d1\u8fc7\u7a0b\u7684\u201caha\u201d\u65f6\u523b\u6b63\u662f\u53d1\u73b0\u6a21\u578b\u81ea\u53d1\u6d8c\u73b0\u8bf8\u5982\u201c\u53cd\u601d\u201d\u3001\u201c\u91cd\u8bd5\u201d\u8fd9\u6837\u7684\u201c\u601d\u8003\u56e0\u5b50\u201d\u3002<\/li>\n<\/ul>\n\n\n\n<p id=\"uc38cb631\">\u540c\u65f6\uff0c<strong>OpenAI\u975e\u5e38\u91cd\u89c6Scaling\uff0c\u503e\u5411\u4e8e\u4e0d\u8981\u6559AI\u505a\u4e8b\uff0c\u8981\u8ba9AI\u81ea\u5df1\u53bb\u5b66\u4e60\u3002<\/strong><\/p>\n\n\n\n<p id=\"ubaa63fde\">Give a man a fish, and you feed him for a day.<\/p>\n\n\n\n<p id=\"u4d533392\">Teach a man to fish, and you feed him for a lifetime.<\/p>\n\n\n\n<p id=\"ud9735a2b\">Teach him the taste of fish and make him hungry.<\/p>\n\n\n\n<p id=\"u535d3f72\">\u2014\u2014 Hyung Won Chung from OpenAI[30]<\/p>\n\n\n\n<p id=\"u0aca6330\">\u7528Token\u4f5c\u4e3aaction\u53ef\u4ee5\u6bcf\u6b21\u53d6\u5f53\u524d\u6a21\u578b\u8f93\u51fa\u7684topk\u4f5c\u4e3a\u5f53\u524d\u53ef\u9009\u7684action\uff0c\u901a\u8fc7\u8fd9\u79cd\u65b9\u5f0f\u53ef\u4ee5\u83b7\u5f97\u65e0\u9650\u5927\u7684\u89e3\u7a7a\u95f4\u3002\u4f46\u662f\u6311\u6218\u4f9d\u7136\u662f\u89e3\u7a7a\u95f4\u65e0\u9650\u5927\u5e26\u6765\u7684\u8bad\u7ec3\u96be\u9898\u3002\u53e6\u5916\uff0c\u5982\u4f55\u8bbe\u8ba1reward
model\u4e5f\u662f\u4e00\u4e2a\u5f88\u96be\u7684\u4e8b\u60c5\u3002\u4e0a\u6587\u63d0\u5230PRM\u76f8\u6bd4ORM\uff0c\u53ef\u4ee5\u7f13\u89e3CAP\u95ee\u9898\uff0c\u76d1\u7763\u4fe1\u53f7\u66f4\u591a\uff0c\u6548\u679c\u66f4\u597d\u3002\u4f46\u662f\u4f7f\u7528Token\u7ea7\u522b\u6bd4\u8f83\u96be\u5e94\u7528PRM\u6a21\u578b\uff0c\u66f4\u7ec6\u81f4\u7684TRM\uff08token reward model\uff09\u4f3c\u4e4e\u53c8\u4e0d\u592a\u6709\u6280\u672f\u53ef\u884c\u6027\u3002\u4f46\u662f\u8003\u8651\u5230OpenAI\u7684\u4e00\u4e2a\u65b9\u6cd5\u8bba\u5c31\u662f\u5047\u8bbe\u6709\u65e0\u9650\u591a\u7684\u7b97\u529b\u6765\u505a\u4e8b\u60c5\uff0c\u56e0\u6b64\u4f9d\u7136\u662f\u4e00\u4e2a\u53ef\u80fd\u7684\u65b9\u5411\u3002<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\" id=\"S7jxd\"\/>\n\n\n\n<p id=\"uc54f7cc7\"><strong>3. Patch\u4f5c\u4e3aAction<\/strong><\/p>\n\n\n\n<p id=\"u9aa77701\">\u901a\u8fc7\u5bf9\u6bd4\u5927\u91cf\u7684o1\u7684\u8f93\u51fa\uff0c\u53ef\u4ee5\u89c2\u5bdf\u5230o1\u7684\u8f93\u51fatoken\u957f\u5ea6\u6c38\u8fdc\u4e3a32\u7684\u6574\u6570\u500d\u3002\u8fd9\u542f\u53d1\u6211\u4eec\u8003\u8651\u4e0a\u9762\u4e24\u8005\u7684\u7ed3\u5408\uff1a\u5982\u679c\u6709\u4e00\u4e2a\u7ec6\u7c92\u5ea6\u4ecb\u4e8eThought-level\u548cToken-level\u4e4b\u95f4\u7684Patch-level action\uff0c\u662f\u4e0d\u662f\u5c31\u53ef\u4ee5\u89e3\u51b3\u8fd9\u4e00\u95ee\u9898\uff1f<\/p>\n\n\n\n<p id=\"u500d83bc\">\u53c2\u8003ViT\u4e2d\u5206patch\u7684\u64cd\u4f5c\uff0c\u53ef\u4ee5\u5b9a\u4e49\u6bcf32\u4e2atoken\u4e3a\u4e00\u4e2apatch\uff0c\u4f5c\u4e3a\u4e00\u4e2aaction\uff0c\u8fd9\u6837\u641c\u7d22\u7a7a\u95f4\u4f1a\u6bd4Token-level\u66f4\u5c0f\uff0c\u6bd4Thought-level\u66f4\u5927\uff0c\u4e5f\u907f\u514d\u4e86\u4e3a\u6a21\u578b\u5b9a\u4e49\u5982\u4f55\u601d\u8003\u7684\u95ee\u9898\u3002<\/p>\n\n\n\n<p id=\"u5ee94412\">\u53e6\u5916\uff0c\u4ecereward
model\u89d2\u5ea6\u601d\u8003\uff0c\u4f7f\u7528patch-level\u53ef\u4ee5\u590d\u7528PRM\u7684\u6280\u672f\u548c\u6570\u636e\u2014\u2014\u53ea\u9700\u8981\u5c06\u6bcf\u4e00\u4e2athoughts\u6309\u716732\u88c1\u526a\/padding\u968f\u540e\u8bad\u7ec3\u5373\u53ef\u3002\u56e0\u6b64o1\u7684\u5b9e\u9645\u505a\u6cd5\u66f4\u503e\u5411\u4e8e\u8fd9\u6837\u3002<\/p>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"TSTF1\">3.1.3.1.2. \u4e00\u5b9a\u662fMCTS\u5417\uff1f<\/h5>\n\n\n\n<p id=\"u78144e6d\">\u4e0a\u9762\u6211\u4eec\u63d0\u5230\uff0c\u4f7f\u7528MCTS\u7684\u76ee\u7684\u662f\u589e\u5927\u6a21\u578b\u7684\u89e3\u7a7a\u95f4\uff0c\u4f46\u662f\u8fd9\u662f\u552f\u4e00\u4e00\u4e2a\u589e\u5927\u6a21\u578b\u89e3\u7a7a\u95f4\u7684\u65b9\u5f0f\u5417\uff1f\u7b54\u6848\u5f53\u7136\u4e0d\u662f\uff0c\u5e76\u4e14\u5f88\u591a\u4eba\u4e5f\u771f\u7684\u8ba4\u4e3a\u4e0d\u662fMCTS\u3002<\/p>\n\n\n\n<p id=\"u9416d8de\">\u6b63\u65b9\u8ba4\u4e3a\u73b0\u5728\u5df2\u6709\u5f88\u591a\u6587\u7ae0\u662f\u5728\u505aLLM+MCTS\u7684[13,14,16,24-29]\uff0c\u5e76\u4e14\u4ed6\u4eec\u81ea\u5df1\u65e9\u671f\u7684\u5de5\u4f5cFSBS[8]\u4e5f\u6709\u4e00\u70b9\u6811\u641c\u7d22\u7684\u5473\u9053\u3002<\/p>\n\n\n\n<p id=\"uc6eeef9e\">\u53cd\u65b9\u8ba4\u4e3a\u9996\u5148Noam\u4e0d\u662f\u505aMCTS\u7684\uff0c\u4f5c\u4e3ao1\u7684\u6838\u5fc3\u8d21\u732e\u8005\uff0c\u4ece\u7814\u7a76\u7684\u8fde\u8d2f\u6027\u6765\u8bf4\uff0cNoam\u4f7f\u7528\u81ea\u5df1\u64c5\u957f\u7684CFR(Counterfactual Regret Minimization)\u53ca\u5176\u53d8\u79cd\u4f3c\u4e4e\u4e5f\u662f\u4e00\u4e2a\u5408\u7406\u7684\u7b54\u6848\u3002\u5176\u6b21MCTS\u5728\u56f4\u68cb\u4e2d\u5f88\u5f3a\uff0cCFR\u5728\u5fb7\u6251\u91cc\u5f88\u5f3a\uff0c\u4f46\u662f\u5b83\u4eec\u53ea\u5728\u5404\u81ea\u7684\u9886\u57df\u5185\u6709\u6548\uff0c\u5728LLM\u9886\u57df\u7a76\u7adf\u8c01\u66f4\u80dc\u4e00\u7b79\u4f9d\u65e7\u4e0d\u597d\u8bf4\u3002<\/p>\n\n\n\n<p
id=\"u55a1758d\"><strong>\u4f46\u662f\u5c31\u50cf\u4e0a\u9762\u63d0\u5230\u7684\uff0c\u65e0\u8bba\u4f7f\u7528\u54ea\u79cd\u65b9\u5f0f\uff0c\u672c\u8d28\u5c31\u662f\u6253\u5f00\u6a21\u578b\u7684\u89e3\u7a7a\u95f4\uff0c\u53ea\u8981\u8fd9\u4e2a\u6a21\u578b\u6709\u4efb\u4f55\u4e00\u6761\u8def\u5f84\u80fd\u591f\u8fbe\u5230\u6b63\u786e\u7ed3\u679c\uff0c\u627e\u5230\u5e76\u5f3a\u5316\u8fd9\u6761\u8def\u5f84\u5c31\u53ef\u4ee5\u4e86\u3002\u56e0\u6b64\u4e2a\u4eba\u7406\u89e3\u751a\u81f3\u4e0d\u8981\u6c42\u8fd9\u4e2a\u6a21\u578b\u5e95\u5ea7\u6709\u591a\u5f3a\u3002<\/strong><\/p>\n\n\n\n<p id=\"u3028e8a7\">Through training, the models learn to refine their thinking process, try different strategies, and recognize their mistakes\u2014\u2014OpenAI<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"R5mj7\">3.1.3.2. \u5982\u4f55\u786e\u5b9aCoT\u8d28\u91cf\uff1f<\/h4>\n\n\n\n<p id=\"u13a6dabb\">\u4e2a\u4eba\u8ba4\u4e3a\uff0c\u8fd8\u662f\u503e\u5411\u4e8ePRM\u3002\u6709\u8bf4\u6cd5[15]\u8bf4\u662fORM+PRM\uff0c\u4e0d\u592a\u8ba4\u540c\uff0c\u5f88\u591a\u8bba\u6587\u90fd\u8bc1\u660eORM<strong>\u6301\u7eed<\/strong>\u6ca1\u6709PRM\u597d\uff1b\u800c\u4e14\u4ece\u8bad\u7ec3\u6570\u636e\u770b\uff0cORM\u7684\u8bad\u7ec3\u6570\u636e\u53ef\u4ee5\u8ba4\u4e3a\u662fPRM\u7684\u5b50\u96c6\uff08PRM\u4fe1\u53f7\u66f4\u5bc6\u96c6\uff0c\u6570\u636e\u5229\u7528\u7387\u66f4\u9ad8\uff09\uff0c\u4e0d\u592a\u53ef\u80fd\u5b50\u96c6\u7684\u6548\u679c\u8981\u6bd4\u5168\u96c6\u597d\u3002<\/p>\n\n\n\n<p id=\"ua2f9cd12\">\u53e6\u5916\uff0c\u5982\u679c\u771f\u662f\u4e0a\u6587\u5206\u6790\u7684patch\u7ea7\u522b\u7684MCTS\uff0c\u5219\u66f4\u9700\u8981PRM\uff08\u6216\u8005PRM\u7684\u53d8\u79cd\uff09\u6765\u5bf9\u6811\u8fdb\u884c\u526a\u679d\uff0c\u7f29\u5c0f\u641c\u7d22\u7a7a\u95f4\u3002<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"Vam1c\">3.1.3.3.
\u5b89\u5168\u5bf9\u9f50<\/h4>\n\n\n\n<p id=\"u68c39279\">\u503c\u5f97\u683c\u5916\u4e00\u63d0\u7684\u662fo1\u5b89\u5168\u5bf9\u9f50\u7b56\u7565\u3002<\/p>\n\n\n\n<p id=\"ud2a7e460\">\u76f8\u6bd4\u4f20\u7edf\u7684RLHF\u505a\u5185\u5bb9\u5b89\u5168\uff0c\u65b0\u7684o1\u66f4\u53ef\u80fd\u4f7f\u7528\u4e86Anthropic\u7684AI\u5baa\u6cd5\u6a21\u5f0f\u6765\u505a\u5185\u5bb9\u5b89\u5168\uff0c\u7ed3\u5408\u5f3a\u5927\u7684\u63a8\u7406\u80fd\u529b\uff0co1\u7684\u5b89\u5168\u6027\u4f1a\u66f4\u52a0\u4f18\u79c0\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728702905984-11255501-5a37-4b09-933a-da408c5d1ee6.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"U083k\">3.1.4. Inference\u9636\u6bb5<\/h3>\n\n\n\n<p id=\"u7d7338c1\">o1\u5982\u4f55\u8fdb\u884cinfer\u7684\uff1f\u731c\u6d4b\u662ftoken by token\u8fdb\u884c\u8f93\u51fa\uff0c\u63a8\u7406\u9636\u6bb5\u5e76\u6ca1\u6709\u4f7f\u7528MCTS\u3002\u539f\u56e0\u4e00\u4e2a\u662fo1\u4f1a\u8f93\u51fa\u9519\u8bef\u7684\u731c\u6d4b\uff0c\u5982\u679c\u662f\u6811\u641c\u7d22\u6ca1\u7406\u7531\u4f1a\u6709\u9519\u8bef\u7684\u8282\u70b9\u3002<\/p>\n\n\n\n<p id=\"u657e882c\">\u53e6\u5916\uff0c\u8fd8\u505a\u4e86\u8f93\u51fatoken\u957f\u5ea6\u548c\u8f93\u51fa\u5ef6\u65f6\u7684\u5173\u7cfb\u5b9e\u9a8c[19]\uff0c\u6574\u4f53\u662f\u7ebf\u6027\u7684\uff0c\u8bf4\u660e\u5e76\u6ca1\u6709token\u88ab\u9690\u85cf\uff0c\u8bc1\u660e\u4e86\u662f\u4e00\u4e2atoken by token\u7684\u8f93\u51fa\u7ed3\u6784\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728711281505-1b6b264d-9eeb-4bda-9cbe-a5fa6cf455f2.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p 
id=\"u27061068\">\u4e0d\u8fc7\u8003\u8651\u5230o1\u5c24\u5176\u662fo1-mini\u7684\u4ef7\u683c\u662f4o-mini\u768420\u500d\uff0c\u56e0\u6b64\u7ebf\u4e0a\u5f88\u53ef\u80fd\u6709\u591a\u4e2a\u6a21\u578b\u5e76\u884c\uff08\u4f46\u8fd9\u5e76\u4e0d\u5f71\u54cdo1\u662f\u4e00\u4e2asingle model\u7684\u7ed3\u8bba\uff09\uff0c\u4f7f\u7528PRM\u9009\u62e9BoN\u6765\u9009\u62e9\u6700\u4f18\u7684thoughts\uff0c\u5e76\u4e14\u7528\u4e86\u52a8\u6001\u7684\u96be\u5ea6\u8ba1\u7b97\u6765\u51b3\u5b9aN[13]\u3002<\/p>\n\n\n\n<p id=\"u8ac6381e\">On the 2024 AIME exams, GPT-4o only solved on average 12% (1.8\/15) of problems. o1 averaged 74% (11.1\/15) with a single sample per problem, 83% (12.5\/15) with consensus among 64 samples, and 93% (13.9\/15) when <strong>re-ranking 1000 samples<\/strong> with a learned scoring function\u2014\u2014OpenAI<\/p>\n\n\n\n<p id=\"ue10c89b6\">\u52a8\u6001\u8ba1\u7b97\u8d44\u6e90\u9009\u62e9[13]\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728618465173-ae416024-fb97-4c60-97e9-6f657dc834dd.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u63745590\"><br><\/p>\n\n\n\n<p id=\"ubde83694\">\u90a3\u65e2\u7136o1\u662f\u4e00\u4e2atoken by token\u7684\u8f93\u51fa\uff0c\u90a3\u6a21\u578b\u4e3a\u4ec0\u4e48\u8fd9\u4e48\u8d35\u5462\uff1fo1\u7cfb\u5217\u4e0d\u4ec5\u8f93\u51fa\u4ef7\u683c\u8d35\uff0c\u8f93\u5165\u4ef7\u683c\u4e5f\u8d35\u3002\u8f93\u5165\u8f93\u51fao1-preview\u5206\u522b\u662f4o\u76844\u500d\/3\u500d\u3002mini\u7684\u8f93\u5165\u8f93\u51fa\u5747\u662f4o-mini\u768420\u500d\u3002<\/p>\n\n\n\n<p id=\"u1cc55432\">\u8f93\u51fa\u7684\u591a\u6837\u6027\u662f\u80fd\u5426\u5c06infer time scaling law\u8d70\u4e0b\u53bb\u7684\u5173\u952e\uff0c\u5426\u5219\u5373\u4f7f\u6709\u518d\u597d\u7684PRM\u4e5f\u6ca1\u529e\u6cd5\u9009\u62e9\u597d\u7684BoN\u8f93\u51fa\uff0c\u4e5f\u5c31\u8fbe\u4e0d\u5230\u9884\u8bbe\u6548\u679c\u3002<\/p>\n\n\n\n<p 
id=\"u2b546d82\">Ideally, test-time compute should modify the distribution so as to generate better outputs than na\u00efvely sampling from the LLM itself would. In general, there are two knobs to induce modifications to an LLM\u2019s distribution[13]<\/p>\n\n\n\n<p id=\"u6c1eaa9c\">\u5373\u4f7f\u5355\u4e2a\u6a21\u578b\u5f00\u9ad8T\uff0c\u591a\u6837\u6027\u8fd8\u662f\u6ca1\u6709\u529e\u6cd5\u4fdd\u8bc1\uff08\u6bd4\u5982\u4e0a\u6587self-critic\u5de5\u4f5c\u4e2d\uff0c\u6240\u6709\u7684critic\u57fa\u672c\u90fd\u6307\u660e\u4e86\u4e00\u4ef6\u4e8b\uff09\u3002<\/p>\n\n\n\n<p id=\"ucabab402\">\u56e0\u6b64\u505a\u51fa\u731c\u6d4b\uff1a<\/p>\n\n\n\n<ul>\n<li>\u8f93\u5165\u7aef\uff1a\u7531\u4e8e\u4f7f\u7528\u4e86AI\u5baa\u6cd5\uff0csystem prompt\u53d8\u957f\u5bfc\u81f4\u7684\uff1b\u4e3a\u4e86\u589e\u52a0BoN\u91c7\u6837\u7684\u591a\u6837\u6027\uff0c\u4f7f\u7528\u591a\u4e2aprompt\u751a\u81f3\u591a\u4e2a\u6a21\u578b\uff0c\u5f3a\u5236\u589e\u52a0\u591a\u6837\u6027\uff0c\u56e0\u6b64kv cache\u4e0d\u5171\u7528\uff0c\u53ea\u80fd\u589e\u52a0\u4ef7\u683c\u3002<\/li>\n\n\n\n<li>\u8f93\u51fa\u7aef\uff1a\u8fd9\u4e2a\u4ef7\u683c\u589e\u957f\u57fa\u672c\u4e0a\u5c31\u662f\u7531\u4e8e\u662f\u591a\u4e2a\u6a21\u578b\uff08\u8fd8\u8981\u6709\u6458\u8981\u6a21\u578b\uff09\u9020\u6210\u7684\u3002\u52a8\u673a\u8fd8\u662f\u524d\u9762\u63d0\u5230\u7684\u589e\u52a0\u8f93\u51fa\u591a\u6837\u6027\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"yZt7h\">3.2. 
o1\u7684\u590d\u73b0<\/h2>\n\n\n\n<p id=\"u3aa7ac1e\">\u76ee\u524do1\u7684\u51e0\u4e2a\u5f00\u6e90\u5de5\u4f5c\u5982\u4e0b\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table><tbody><tr><td><\/td><td>method<\/td><td>model<\/td><td>data<\/td><td>code<\/td><td>RL<\/td><\/tr><tr><td>Open-O1<\/td><td>\u274c\uff08Todo\uff09<\/td><td>\u2705<\/td><td>\u274c\uff08Todo\uff09<\/td><td>\u274c<\/td><td>\u274c\uff08Todo\uff09<\/td><\/tr><tr><td>O1-journey<\/td><td>\u2705<\/td><td>\u274c<\/td><td>\u274c<\/td><td>\u274c<\/td><td>\u2705\uff08DPO\uff09<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<figure class=\"wp-block-embed\"><div class=\"wp-block-embed__wrapper\">\nhttps:\/\/arxiv.org\/abs\/2410.02884\n<\/div><\/figure>\n\n\n\n<p id=\"uf6228e79\"><strong>\u5176\u5b9e\u5bf9\u4e8e\u590d\u73b0o1\uff0c\u6709\u4e09\u4e2a\u6838\u5fc3\u95ee\u9898\uff1a<\/strong><\/p>\n\n\n\n<ol>\n<li>\u5982\u4f55\u83b7\u53d6\u8f93\u51fa\u957fCoT(with critic)\uff1f<\/li>\n\n\n\n<li>\u5982\u4f55\u4fdd\u8bc1\u6570\u636e\u51c6\u786e\u6027\uff1freward model\uff1f<\/li>\n\n\n\n<li>\u83b7\u53d6\u5230\u6709\u6548\u6570\u636e\u540e\uff0c\u5982\u4f55\u8fdb\u884c\u540e\u7eed\u8bad\u7ec3\uff1f<\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"bac0a\">3.2.1. O1-Journey[22]<\/h3>\n\n\n\n<p id=\"u104cf2d1\"><strong>\u76ee\u524d\u6ca1\u6709benchmark\u6d4b\u8bc4\u5bf9\u6bd4\u3002<\/strong><\/p>\n\n\n\n<p id=\"ub437ec9a\">\u8f93\u51fademo\uff1a<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1728631996645-bf1104f6-2810-4c64-8aec-3cfb2ed6cf52.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<p id=\"u0bf85503\">\u5bf9\u4e8eo1-journey\u56de\u7b54\u4e0a\u8ff0\u4e09\u4e2a\u95ee\u9898\uff1a<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"iEmVj\">3.2.1.1. \u65b9\u6cd5<\/h4>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"T8z2O\">3.2.1.1.1. 
\u5982\u4f55\u83b7\u53d6\u957fCoT\u8f93\u51fa\uff1f<\/h5>\n\n\n\n<p id=\"ufceef1f5\">\u901a\u8fc7\u4e00\u4e2a\u53ebjourney\u7684\u65b9\u5f0f\u3002\u5047\u8bbe\u76ee\u524d\u67092\u6761\u63a8\u7406\u8def\u5f84\uff08Math\u6a21\u578b\u81ea\u6709\u7684CoT\u8def\u5f84\uff09\uff1a<\/p>\n\n\n\n<p id=\"u44419433\">question-&gt;step0-&gt;step1-&gt;&#8230;.-&gt;step6-&gt;right_answer \uff08\u8fd9\u7c7b\u6837\u672c\u53ebshortcut\uff09<\/p>\n\n\n\n<p id=\"u7a7207f6\">\u8fd8\u6709\u9519\u8bef\u8def\u5f84\uff0c\u4f8b\u5982<\/p>\n\n\n\n<p id=\"u829ae8df\">question-&gt;step0&#8242;-&gt;step1&#8242;-&gt;&#8230;.-&gt;step6&#8242;-&gt;wrong_answer<\/p>\n\n\n\n<p id=\"ua96499c0\">\u6781\u957f\u601d\u7ef4\u94fe\u7b49\u4e8e<\/p>\n\n\n\n<p id=\"u0e65f5e9\">&#8220;question-&gt;step0&#8242;-&gt;step1&#8242;-&gt;&#8230;.-&gt;step6&#8242;-&gt;wrong_answer&#8221;+ &#8220;emmm, \u597d\u50cf\u4e0d\u592a\u5bf9\uff0c\u6211\u60f3\u4e00\u4e0b&#8221; + &#8220;step0-&gt;step1-&gt;&#8230;.-&gt;step6-&gt;right_answer&#8221;<\/p>\n\n\n\n<p id=\"u3c46dcb1\">\u7c7b\u4f3c\u7684\u8fd8\u6709\u5176\u4ed6\u884c\u4e3a\uff0creflections, corrections, reasoning\uff0c\u968f\u540e\u7ecf\u8fc7GPT-4o\u6539\u5199\uff0c\u83b7\u5f97\u957fCoT\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1729156879763-2b2dc8e5-2324-4548-bcf4-2c283b10f361.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"bdZ5E\">3.2.1.1.2. 
\u5982\u4f55\u4fdd\u8bc1\u8f93\u51fa\u51c6\u786e\u6027\uff08reward model\uff09<\/h5>\n\n\n\n<p id=\"u50918f25\">\u4f7f\u7528\u4e0a\u9762\u7684\u6570\u636e\u8bad\u7ec3\u4e00\u4e2asft\u6a21\u578b\uff0c\u5728\u83b7\u53d6\u5230\u53ef\u4ee5\u8f93\u51fa\u957fCoT\u7684\u6a21\u578b\uff0c\u4f46\u662f\u4e00\u4e2a\u95ee\u9898\u662f\u4e0a\u8ff0\u6b65\u9aa4\u5f88\u660e\u663e\u6709\u8f83\u591a\u5197\u4f59\u6b65\u9aa4\u3002<\/p>\n\n\n\n<p id=\"uaf3af518\">\u56e0\u6b64\u751f\u6210\u4e00\u6279\u9ad8\u8d28\u91cfCoT\u6570\u636e\u7ee7\u7eed\u5fae\u8c03sft\u6a21\u578b\u3002<\/p>\n\n\n\n<p id=\"ue6498052\">\u8fd9\u4e00\u6b65\u4f7f\u7528PRM+o1\u4f5c\u4e3a\u5956\u52b1\u6a21\u578b\u6784\u5efa\u6570\u636e\uff0c\u4f7f\u7528\u5fae\u8c03\u540e\u7684DeepSeekMath-7B-Base\uff0c\u8fdb\u884c<strong>beam search<\/strong>\u641c\u7d22\u4e0e\u526a\u679d\u3002<\/p>\n\n\n\n<p id=\"u534a549d\">\u5bf9\u6bd4\u4e86\u591a\u4e2a\u6a21\u578b\u4f5c\u4e3areward_model\u3002\u4f46\u662f\u5e76\u6ca1\u6709\u6570\u636e\u8868\u660e\u8fd9\u4e9breward\u7684diff\u591a\u5927\u7a0b\u5ea6\u5f71\u54cd\u540e\u7eed\u6b65\u9aa4\u7684\u7ed3\u679c\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image\"><img decoding=\"async\" src=\"https:\/\/intranetproxy.alipay.com\/skylark\/lark\/0\/2024\/png\/147556570\/1729156947957-6462670a-36d8-4d15-953c-016b8c3a45c8.png\" alt=\"\" title=\"\"\/><\/figure>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"xYzHm\">3.2.1.1.3. 
\u540e\u7eed\u8bad\u7ec3<\/h5>\n\n\n\n<p id=\"u74dfcf44\">\u73b0\u5728\u83b7\u5f97\u4e86\u53ef\u4ee5\u8f93\u51fa\u4e0d\u5197\u4f59\u957fCoT\u7684\u6a21\u578b\uff0c\u63a5\u4e0b\u6765\u5c31\u662f\u5fae\u8c03\u4e00\u4e2asft\u6a21\u578b\uff0c\u4f7f\u5176\u53ef\u4ee5\u8f93\u51fa\u4e0d\u5197\u4f59\u957fCoT\u6570\u636e\u3002<\/p>\n\n\n\n<p id=\"u0642c4f9\">\u4e3b\u8981\u5206\u4e3a\u4ee5\u4e0b\u51e0\u6b65\uff1a<\/p>\n\n\n\n<ol>\n<li>\u4f7f\u7528Abel\u548cPRM800k\u5fae\u8c03DeepSeek-math-7b-base\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528\u4e0a\u4e00\u6b65\u751f\u6210\u7684\u957fCoT\u4f4e\u5197\u4f59\u6570\u636e\uff08327\u4e2a\uff09\u5fae\u8c03sft\u6a21\u578b\u3002<\/li>\n\n\n\n<li>\u4f7f\u7528PRM800k\u6570\u636e\u96c6\u91c7\u6837\u7684prompt\uff0c\u5bf9\u7b2c\u4e8c\u6b65\u7684\u6a21\u578b\u7ee7\u7eed\u91c7\u683720\u4e2aresponse\u3002\u5176\u4e2d\u7ed3\u679c\u6b63\u786e\u7684\u4e3achosen\uff0c\u7ed3\u679c\u9519\u8bef\u7684\u4e3areject\u3002\u5bf9\u6b63\u8d1f\u6837\u672c\u8fdb\u884cDPO\u5fae\u8c03\u3002<\/li>\n<\/ol>\n\n\n\n<h5 class=\"wp-block-heading\" id=\"Hudqw\">3.2.1.1.4. \u603b\u7ed3<\/h5>\n\n\n\n<p id=\"uf7147922\">\u521b\u5efa\u53ef\u4ee5\u8f93\u51fa\u957fCoT\u7684\u6a21\u578b-&gt;\u9009\u62e9\u4e00\u6279\u597d\u6570\u636e-&gt;\u8bad\u7ec3\u4e00\u4e2a\u53ef\u4ee5\u8f93\u51fa\u957fCoT\u5e76\u4e14\u6bcf\u4e00\u6b65\u90fd\u9ad8reward\u7684\u6a21\u578b-&gt;DPO\u4f18\u5316\u3002<\/p>\n\n\n\n<h1 class=\"wp-block-heading\" id=\"pEjyU\">4. \u5f15\u7528<\/h1>\n\n\n\n<ol>\n<li>Training Verifiers to Solve Math Word Problems. <a href=\"http:\/\/arxiv.org\/abs\/2110.14168\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2110.14168<\/a><\/li>\n\n\n\n<li>Let&#8217;s Verify Step by Step. <a href=\"http:\/\/arxiv.org\/abs\/2305.20050\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2305.20050<\/a><\/li>\n\n\n\n<li>Steps Toward Artificial Intelligence*. 
<a href=\"https:\/\/courses.csail.mit.edu\/6.803\/pdf\/steps.pdf\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/courses.csail.mit.edu\/6.803\/pdf\/steps.pdf<\/a><\/li>\n\n\n\n<li>What is the credit assignment problem? <a href=\"https:\/\/ai.stackexchange.com\/questions\/12908\/what-is-the-credit-assignment-problem\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/ai.stackexchange.com\/questions\/12908\/what-is-the-credit-assignment-problem<\/a><\/li>\n\n\n\n<li>Show Your Work: Scratchpads for Intermediate Computation with Language Models. <a href=\"https:\/\/arxiv.org\/abs\/2112.00114\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/abs\/2112.00114<\/a><\/li>\n\n\n\n<li>Chain-of-Thought Prompting Elicits Reasoning in Large Language Models. <a href=\"https:\/\/arxiv.org\/abs\/2201.11903\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/abs\/2201.11903<\/a><\/li>\n\n\n\n<li>Self-critiquing models for assisting human evaluators. <a href=\"http:\/\/arxiv.org\/abs\/2206.05802\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2206.05802<\/a><\/li>\n\n\n\n<li>LLM Critics Help Catch LLM Bugs. <a href=\"http:\/\/arxiv.org\/abs\/2407.00215\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2407.00215<\/a><\/li>\n\n\n\n<li>Constitutional AI: Harmlessness from AI Feedback. <a href=\"http:\/\/arxiv.org\/abs\/2212.08073\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2212.08073<\/a><\/li>\n\n\n\n<li>STaR: Self-Taught Reasoner Bootstrapping Reasoning With Reasoning. <a href=\"https:\/\/arxiv.org\/abs\/2203.14465\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/abs\/2203.14465<\/a><\/li>\n\n\n\n<li>Towards Revealing the Mystery behind Chain of Thought: A Theoretical Perspective. 
<a href=\"https:\/\/arxiv.org\/abs\/2305.15408\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/abs\/2305.15408<\/a><\/li>\n\n\n\n<li>Chain of Thought Empowers Transformers to Solve Inherently Serial Problems. <a href=\"https:\/\/arxiv.org\/abs\/2402.12875\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/abs\/2402.12875<\/a><\/li>\n\n\n\n<li>Scaling LLM Test-Time Compute Optimally can be More Effective than Scaling Model Parameters. <a href=\"http:\/\/arxiv.org\/abs\/2408.03314\" target=\"_blank\" rel=\"noreferrer noopener\">http:\/\/arxiv.org\/abs\/2408.03314<\/a><\/li>\n\n\n\n<li>Accessing GPT-4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self-refine with LLaMa-3 8B: A Technical Report. <a href=\"https:\/\/arxiv.org\/pdf\/2406.07394\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2406.07394<\/a><\/li>\n\n\n\n<li>Reverse-o1:OpenAI o1\u539f\u7406\u9006\u5411\u5de5\u7a0b\u56fe\u89e3. <a href=\"https:\/\/zhuanlan.zhihu.com\/p\/721952915\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/zhuanlan.zhihu.com\/p\/721952915<\/a><\/li>\n\n\n\n<li>MUTUAL REASONING MAKES SMALLER LLMS STRONGER PROBLEM-SOLVERS. <a href=\"https:\/\/arxiv.org\/pdf\/2408.06195\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2408.06195<\/a><\/li>\n\n\n\n<li>Learning to Reason with LLMs. <a href=\"https:\/\/openai.com\/index\/learning-to-reason-with-llms\/\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/openai.com\/index\/learning-to-reason-with-llms\/<\/a><\/li>\n\n\n\n<li>O1 journey. <a href=\"https:\/\/github.com\/GAIR-NLP\/O1-Journey?tab=readme-ov-file\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/GAIR-NLP\/O1-Journey<\/a><\/li>\n\n\n\n<li>Reasoning Series, Part 1: Understanding GPT-o1. 
<a href=\"https:\/\/leehanchung.github.io\/blogs\/2024\/10\/08\/reasoning-understanding-o1\/\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/leehanchung.github.io\/blogs\/2024\/10\/08\/reasoning-understanding-o1\/<\/a><\/li>\n\n\n\n<li>OpenAI o1 System Card. <a href=\"https:\/\/openai.com\/index\/openai-o1-system-card\/\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/openai.com\/index\/openai-o1-system-card\/<\/a><\/li>\n\n\n\n<li>A prompt that helps Claude 3.5 Sonnet beat OpenAI&#8217;s o1 model in reasoning! <a href=\"https:\/\/x.com\/JeremyNguyenPhD\/status\/1842888290376261668\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/x.com\/JeremyNguyenPhD\/status\/1842888290376261668<\/a><\/li>\n\n\n\n<li>O1 Replication Journey: A Strategic Progress Report. <a href=\"https:\/\/github.com\/GAIR-NLP\/O1-Journey?tab=readme-ov-file\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/GAIR-NLP\/O1-Journey<\/a><\/li>\n\n\n\n<li>Open O1: A Model Matching Proprietary Power with Open-Source Innovation. <a href=\"https:\/\/github.com\/OpenSource-O1\/Open-O1\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/OpenSource-O1\/Open-O1<\/a><\/li>\n\n\n\n<li>ALPHAZERO-LIKE TREE-SEARCH CAN GUIDE LARGE<\/li>\n<\/ol>\n\n\n\n<p id=\"uf40eea73\">LANGUAGE MODEL DECODING AND TRAINING. 
<a href=\"https:\/\/arxiv.org\/pdf\/2309.17179v1\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2309.17179v1<\/a><\/p>\n\n\n\n<ol start=\"25\">\n<li>Towards Self-Improvement of LLMs via MCTS: Leveraging Stepwise Knowledge with Curriculum Preference Learning <a href=\"https:\/\/arxiv.org\/pdf\/2410.06508\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2410.06508<\/a><\/li>\n\n\n\n<li>Accessing GPT-4 level Mathematical Olympiad Solutions via Monte Carlo Tree Self-refine with LLaMa-3 8B: A Technical Report <a href=\"https:\/\/arxiv.org\/pdf\/2406.07394\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2406.07394<\/a><\/li>\n\n\n\n<li>Towards Self-Improvement of LLMs via MCTS: Leveraging Stepwise Knowledge with Curriculum Preference Learning <a href=\"https:\/\/arxiv.org\/pdf\/2410.06508\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2410.06508<\/a><\/li>\n\n\n\n<li>Interpretable Contrastive Monte Carlo Tree Search Reasoning <a href=\"https:\/\/arxiv.org\/pdf\/2410.01707\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2410.01707<\/a><\/li>\n\n\n\n<li>Monte Carlo Tree Search Boosts Reasoning via Iterative Preference Learning <a href=\"https:\/\/arxiv.org\/pdf\/2405.00451\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/arxiv.org\/pdf\/2405.00451<\/a><\/li>\n\n\n\n<li>MIT EI seminar, Hyung Won Chung from OpenAI. &#8220;Don&#8217;t teach. Incentivize.&#8221; <a href=\"https:\/\/www.youtube.com\/watch?v=kYWUEV_e2ss\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/www.youtube.com\/watch?v=kYWUEV_e2ss<\/a><\/li>\n\n\n\n<li>What&#8217;s the training pipeline of PRM\uff1f <a href=\"https:\/\/github.com\/openai\/prm800k\/issues\/3\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/openai\/prm800k\/issues\/3<\/a><\/li>\n\n\n\n<li>Questions about implmentation detail. 
<a href=\"https:\/\/github.com\/openai\/prm800k\/issues\/7\" target=\"_blank\" rel=\"noreferrer noopener\">https:\/\/github.com\/openai\/prm800k\/issues\/7<\/a><\/li>\n<\/ol>\n","protected":false},"excerpt":{"rendered":"<p>\u6ce8\uff1a\u672c\u6587\u5199\u4e8e2024\u5e749\u670824\u65e5\uff0c\u6709\u4e9b\u6280\u672f\u731c\u6d4b\u53ef\u80fd\u5df2\u7ecf\u88ab\u8bc1\u4f2a\u6216\u8fc7\u65f6\u3002\u4f46\u662f\u4e3b\u7ebf\u6280\u672f\u6ca1\u95ee\u9898\uff0c\u53ea\u662f\u6700\u540e\u7684\u6280 [&hellip;]<\/p>\n","protected":false},"author":2,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"om_disable_all_campaigns":false,"_mi_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"footnotes":""},"categories":[9],"tags":[],"views":2384,"_links":{"self":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/4568"}],"collection":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/comments?post=4568"}],"version-history":[{"count":2,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/4568\/revisions"}],"predecessor-version":[{"id":4571,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/posts\/4568\/revisions\/4571"}],"wp:attachment":[{"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/media?parent=4568"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/categories?post=4568"},{"taxonomy":"post_tag","embeddable":true,"href":"http:\/\/www.sniper97.cn\/index.php\/wp-json\/wp\/v2\/tags?post=4568"}],"curies":[{"name":"
wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}