Page Not Found | TaskWeaver
-
+
diff --git a/assets/js/1bff86ef.eaf30c0a.js b/assets/js/1bff86ef.a54ebcb3.js
similarity index 99%
rename from assets/js/1bff86ef.eaf30c0a.js
rename to assets/js/1bff86ef.a54ebcb3.js
index 5582417e..a1c1b595 100644
--- a/assets/js/1bff86ef.eaf30c0a.js
+++ b/assets/js/1bff86ef.a54ebcb3.js
@@ -1 +1 @@
-"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[3809],{2963:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>h,contentTitle:()=>i,default:()=>c,frontMatter:()=>s,metadata:()=>r,toc:()=>d});var a=n(5893),o=n(1151);const s={},i="How to evaluate a LLM agent?",r={permalink:"/TaskWeaver/blog/evaluation",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md",source:"@site/blog/evaluation.md",title:"How to evaluate a LLM agent?",description:"The challenges",date:"2024-05-15T07:38:43.000Z",formattedDate:"May 15, 2024",tags:[],readingTime:6.29,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,nextItem:{title:"Roles in TaskWeaver",permalink:"/TaskWeaver/blog/role"}},h={authorsImageUrls:[]},d=[{value:"The challenges",id:"the-challenges",level:2},{value:"A new evaluation method",id:"a-new-evaluation-method",level:2},{value:"How to adapt for other agents?",id:"how-to-adapt-for-other-agents",level:2}];function l(e){const t={a:"a",code:"code",h2:"h2",img:"img",p:"p",pre:"pre",strong:"strong",...(0,o.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(t.h2,{id:"the-challenges",children:"The challenges"}),"\n",(0,a.jsx)(t.p,{children:"It is nontrivial to evaluate the performance of a LLM agent.\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth."}),"\n",(0,a.jsx)(t.p,{children:"This evaluation method has its limitations due to its rigid nature.\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs."}),"\n",(0,a.jsx)(t.p,{children:"The below example illustrates this point:"}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 1: It is sunny today in New York.\n"})}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 2: Do you want to know the weather in New York today?\nHuman: Yes.\nAgent 2: It is sunny today.\n"})}),"\n",(0,a.jsx)(t.p,{children:'Compared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\nHowever, both agents provide the correct answer to the question.\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., "sunny").'}),"\n",(0,a.jsx)(t.h2,{id:"a-new-evaluation-method",children:"A new evaluation method"}),"\n",(0,a.jsxs)(t.p,{children:["Therefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\n",(0,a.jsx)(t.img,{alt:"Evaluation",src:n(100).Z+"",width:"965",height:"659"}),"\nWe introduce two new roles during the evaluation process: the ",(0,a.jsx)(t.strong,{children:"Examiner"})," and the ",(0,a.jsx)(t.strong,{children:"Judge"}),".\nFor each test case, the task description is first given to the Examiner.\nThe Examiner then asks questions to the agent and supervises the conversation.\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\nThe Judge will then evaluate the solution based on the ground truth.\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations."]}),"\n",(0,a.jsx)(t.p,{children:'Let\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task "Sum of 1 to 50".\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.'}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |-\n The task has many rounds. The initial total sum is 0. \n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\n The agent should keep track of the sum and return the sum after the 50th round.\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\nscoring_points:\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\n weight: 1\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\n weight: 2\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\n weight: 3\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\n weight: 4\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\n weight: 5\n"})}),"\n",(0,a.jsxs)(t.p,{children:["The ground truth is represented by the ",(0,a.jsx)(t.code,{children:"scoring_points"})," field in the YAML file.\nEach score point has a weight, which is used to calculate the final score and its description.\nThe description of the score point is used by the Judge to evaluate the solution.\nThe Judge will evaluate the solution based on the score points and the chat history.\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\nTherefore, the normalized score is between 0 and 1."]}),"\n",(0,a.jsx)(t.p,{children:"In some cases, it may require a more precise way to evaluate the solution, e.g., with code.\nThis following task description is an example of such a case."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |- \n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \n 1. generate 1 random integer number and save it to a file named 'a.txt', just tell me if the task is done\n 2. tell me a random joke\n 3. save the previously generated random number to a file named 'b.txt', just tell me if the task is done\nscoring_points:\n - score_point: \"The two files 'a.txt' and 'b.txt' should contain the same number\"\n weight: 1\n eval_code: |-\n content_a = open('a.txt', 'r').read().strip()\n content_b = open('b.txt', 'r').read().strip()\n assert content_a == content_b, f\"content of a.txt: {content_a}, content of b.txt: {content_b}\"\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We need to evaluate the solution based on the content of the files 'a.txt' and 'b.txt'.\nThe ",(0,a.jsx)(t.code,{children:"eval_code"})," field is used to write the evaluation code.\nYou can treat it as a normal test case in a unit test framework using the ",(0,a.jsx)(t.code,{children:"assert"})," statement.\nThe solution get the score point if the ",(0,a.jsx)(t.code,{children:"assert"})," statement does not raise an exception."]}),"\n",(0,a.jsx)(t.p,{children:"We provide additional fields in the YAML file to specify the evaluation environment."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"version: the version of the evaluation file\nconfig_var: configurations of the agent for this evaluation case\napp_dir: the working directory of the agent\ndependencies: list of packages required by the agent\ndata_files: list of files copied to the working directory\nmax_rounds: the maximum number of rounds for the conversation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the ",(0,a.jsx)(t.code,{children:"auto_eval/cases"})," directory.\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\nTo run the evaluation, you can find more details in the\n",(0,a.jsx)(t.a,{href:"https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md",children:"auto_eval/README.md"})," file."]}),"\n",(0,a.jsx)(t.h2,{id:"how-to-adapt-for-other-agents",children:"How to adapt for other agents?"}),"\n",(0,a.jsxs)(t.p,{children:["Although the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\nas long as the agent can be treated as a conversational partner.\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\nas we did for TaskWeaver in ",(0,a.jsx)(t.code,{children:"auto_eval/taskweaver_eval.py"}),":"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:'class TaskWeaverVirtualUser(VirtualUser):\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\n super().__init__(task_description)\n\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\n self.session = self.app.get_session()\n self.session_id = self.session.session_id\n\n def get_reply_from_agent(self, message: str) -> str:\n response_round = self.session.send_message(\n message,\n event_handler=None,\n )\n assert response_round.state != "failed", "Failed to get response from agent."\n return response_round.post_list[-1].message\n\n def close(self):\n self.app.stop()\n'})}),"\n",(0,a.jsxs)(t.p,{children:["To add another agent, you need to implement the ",(0,a.jsx)(t.code,{children:"VirtualUser"})," class and the ",(0,a.jsx)(t.code,{children:"get_reply_from_agent"}),", ",(0,a.jsx)(t.code,{children:"close"})," methods."]})]})}function c(e={}){const{wrapper:t}={...(0,o.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(l,{...e})}):l(e)}},100:(e,t,n)=>{n.d(t,{Z:()=>a});const a=n.p+"assets/images/evaluation-ac91a46e949f383154a9ffbafcfbc981.png"},1151:(e,t,n)=>{n.d(t,{Z:()=>r,a:()=>i});var a=n(7294);const o={},s=a.createContext(o);function i(e){const t=a.useContext(s);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function r(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(o):e.components||o:i(e.components),a.createElement(s.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
+"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[3809],{2963:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>h,contentTitle:()=>i,default:()=>c,frontMatter:()=>s,metadata:()=>r,toc:()=>d});var a=n(5893),o=n(1151);const s={},i="How to evaluate a LLM agent?",r={permalink:"/TaskWeaver/blog/evaluation",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md",source:"@site/blog/evaluation.md",title:"How to evaluate a LLM agent?",description:"The challenges",date:"2024-05-16T07:27:05.000Z",formattedDate:"May 16, 2024",tags:[],readingTime:6.29,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,nextItem:{title:"Roles in TaskWeaver",permalink:"/TaskWeaver/blog/role"}},h={authorsImageUrls:[]},d=[{value:"The challenges",id:"the-challenges",level:2},{value:"A new evaluation method",id:"a-new-evaluation-method",level:2},{value:"How to adapt for other agents?",id:"how-to-adapt-for-other-agents",level:2}];function l(e){const t={a:"a",code:"code",h2:"h2",img:"img",p:"p",pre:"pre",strong:"strong",...(0,o.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(t.h2,{id:"the-challenges",children:"The challenges"}),"\n",(0,a.jsx)(t.p,{children:"It is nontrivial to evaluate the performance of a LLM agent.\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth."}),"\n",(0,a.jsx)(t.p,{children:"This evaluation method has its limitations due to its rigid nature.\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs."}),"\n",(0,a.jsx)(t.p,{children:"The below example illustrates this point:"}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 1: It is sunny today in New York.\n"})}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 2: Do you want to know the weather in New York today?\nHuman: Yes.\nAgent 2: It is sunny today.\n"})}),"\n",(0,a.jsx)(t.p,{children:'Compared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\nHowever, both agents provide the correct answer to the question.\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., "sunny").'}),"\n",(0,a.jsx)(t.h2,{id:"a-new-evaluation-method",children:"A new evaluation method"}),"\n",(0,a.jsxs)(t.p,{children:["Therefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\n",(0,a.jsx)(t.img,{alt:"Evaluation",src:n(100).Z+"",width:"965",height:"659"}),"\nWe introduce two new roles during the evaluation process: the ",(0,a.jsx)(t.strong,{children:"Examiner"})," and the ",(0,a.jsx)(t.strong,{children:"Judge"}),".\nFor each test case, the task description is first given to the Examiner.\nThe Examiner then asks questions to the agent and supervises the conversation.\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\nThe Judge will then evaluate the solution based on the ground truth.\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations."]}),"\n",(0,a.jsx)(t.p,{children:'Let\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task "Sum of 1 to 50".\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.'}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |-\n The task has many rounds. The initial total sum is 0. \n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\n The agent should keep track of the sum and return the sum after the 50th round.\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\nscoring_points:\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\n weight: 1\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\n weight: 2\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\n weight: 3\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\n weight: 4\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\n weight: 5\n"})}),"\n",(0,a.jsxs)(t.p,{children:["The ground truth is represented by the ",(0,a.jsx)(t.code,{children:"scoring_points"})," field in the YAML file.\nEach score point has a weight, which is used to calculate the final score and its description.\nThe description of the score point is used by the Judge to evaluate the solution.\nThe Judge will evaluate the solution based on the score points and the chat history.\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\nTherefore, the normalized score is between 0 and 1."]}),"\n",(0,a.jsx)(t.p,{children:"In some cases, it may require a more precise way to evaluate the solution, e.g., with code.\nThis following task description is an example of such a case."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |- \n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \n 1. generate 1 random integer number and save it to a file named 'a.txt', just tell me if the task is done\n 2. tell me a random joke\n 3. save the previously generated random number to a file named 'b.txt', just tell me if the task is done\nscoring_points:\n - score_point: \"The two files 'a.txt' and 'b.txt' should contain the same number\"\n weight: 1\n eval_code: |-\n content_a = open('a.txt', 'r').read().strip()\n content_b = open('b.txt', 'r').read().strip()\n assert content_a == content_b, f\"content of a.txt: {content_a}, content of b.txt: {content_b}\"\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We need to evaluate the solution based on the content of the files 'a.txt' and 'b.txt'.\nThe ",(0,a.jsx)(t.code,{children:"eval_code"})," field is used to write the evaluation code.\nYou can treat it as a normal test case in a unit test framework using the ",(0,a.jsx)(t.code,{children:"assert"})," statement.\nThe solution get the score point if the ",(0,a.jsx)(t.code,{children:"assert"})," statement does not raise an exception."]}),"\n",(0,a.jsx)(t.p,{children:"We provide additional fields in the YAML file to specify the evaluation environment."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"version: the version of the evaluation file\nconfig_var: configurations of the agent for this evaluation case\napp_dir: the working directory of the agent\ndependencies: list of packages required by the agent\ndata_files: list of files copied to the working directory\nmax_rounds: the maximum number of rounds for the conversation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the ",(0,a.jsx)(t.code,{children:"auto_eval/cases"})," directory.\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\nTo run the evaluation, you can find more details in the\n",(0,a.jsx)(t.a,{href:"https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md",children:"auto_eval/README.md"})," file."]}),"\n",(0,a.jsx)(t.h2,{id:"how-to-adapt-for-other-agents",children:"How to adapt for other agents?"}),"\n",(0,a.jsxs)(t.p,{children:["Although the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\nas long as the agent can be treated as a conversational partner.\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\nas we did for TaskWeaver in ",(0,a.jsx)(t.code,{children:"auto_eval/taskweaver_eval.py"}),":"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:'class TaskWeaverVirtualUser(VirtualUser):\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\n super().__init__(task_description)\n\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\n self.session = self.app.get_session()\n self.session_id = self.session.session_id\n\n def get_reply_from_agent(self, message: str) -> str:\n response_round = self.session.send_message(\n message,\n event_handler=None,\n )\n assert response_round.state != "failed", "Failed to get response from agent."\n return response_round.post_list[-1].message\n\n def close(self):\n self.app.stop()\n'})}),"\n",(0,a.jsxs)(t.p,{children:["To add another agent, you need to implement the ",(0,a.jsx)(t.code,{children:"VirtualUser"})," class and the ",(0,a.jsx)(t.code,{children:"get_reply_from_agent"}),", ",(0,a.jsx)(t.code,{children:"close"})," methods."]})]})}function c(e={}){const{wrapper:t}={...(0,o.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(l,{...e})}):l(e)}},100:(e,t,n)=>{n.d(t,{Z:()=>a});const a=n.p+"assets/images/evaluation-ac91a46e949f383154a9ffbafcfbc981.png"},1151:(e,t,n)=>{n.d(t,{Z:()=>r,a:()=>i});var a=n(7294);const o={},s=a.createContext(o);function i(e){const t=a.useContext(s);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function r(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(o):e.components||o:i(e.components),a.createElement(s.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
diff --git a/assets/js/792477b0.73d1095a.js b/assets/js/792477b0.e145da8b.js
similarity index 98%
rename from assets/js/792477b0.73d1095a.js
rename to assets/js/792477b0.e145da8b.js
index 4055128b..ff02723a 100644
--- a/assets/js/792477b0.73d1095a.js
+++ b/assets/js/792477b0.e145da8b.js
@@ -1 +1 @@
-"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[9522],{8051:e=>{e.exports=JSON.parse('{"blogPosts":[{"id":"/evaluation","metadata":{"permalink":"/TaskWeaver/blog/evaluation","editUrl":"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md","source":"@site/blog/evaluation.md","title":"How to evaluate a LLM agent?","description":"The challenges","date":"2024-05-15T07:38:43.000Z","formattedDate":"May 15, 2024","tags":[],"readingTime":6.29,"hasTruncateMarker":false,"authors":[],"frontMatter":{},"unlisted":false,"nextItem":{"title":"Roles in TaskWeaver","permalink":"/TaskWeaver/blog/role"}},"content":"## The challenges\\nIt is nontrivial to evaluate the performance of a LLM agent. \\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth.\\n\\nThis evaluation method has its limitations due to its rigid nature. \\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs.\\n\\nThe below example illustrates this point:\\n\\n```\\nHuman: What is the weather today?\\nAgent 1: It is sunny today in New York.\\n```\\n\\n```\\nHuman: What is the weather today?\\nAgent 2: Do you want to know the weather in New York today?\\nHuman: Yes.\\nAgent 2: It is sunny today.\\n```\\n\\nCompared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\\nHowever, both agents provide the correct answer to the question.\\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents \\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., \\"sunny\\").\\n\\n## A new evaluation method\\nTherefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\\n![Evaluation](../static/img/evaluation.png)\\nWe introduce two new roles during the evaluation process: the **Examiner** and the **Judge**.\\nFor each test case, the task description is first given to the Examiner.\\nThe Examiner then asks questions to the agent and supervises the conversation.\\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\\nThe Judge will then evaluate the solution based on the ground truth.\\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations.\\n\\nLet\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task \\"Sum of 1 to 50\\".\\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.\\n```yaml\\ntask_description: |-\\n The task has many rounds. The initial total sum is 0. \\n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\\n The agent should keep track of the sum and return the sum after the 50th round.\\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\\nscoring_points:\\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\\n weight: 1\\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\\n weight: 2\\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\\n weight: 3\\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\\n weight: 4\\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\\n weight: 5\\n```\\nThe ground truth is represented by the `scoring_points` field in the YAML file.\\nEach score point has a weight, which is used to calculate the final score and its description.\\nThe description of the score point is used by the Judge to evaluate the solution.\\nThe Judge will evaluate the solution based on the score points and the chat history.\\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\\nTherefore, the normalized score is between 0 and 1.\\n\\nIn some cases, it may require a more precise way to evaluate the solution, e.g., with code.\\nThis following task description is an example of such a case.\\n```yaml\\ntask_description: |- \\n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \\n 1. generate 1 random integer number and save it to a file named \'a.txt\', just tell me if the task is done\\n 2. tell me a random joke\\n 3. save the previously generated random number to a file named \'b.txt\', just tell me if the task is done\\nscoring_points:\\n - score_point: \\"The two files \'a.txt\' and \'b.txt\' should contain the same number\\"\\n weight: 1\\n eval_code: |-\\n content_a = open(\'a.txt\', \'r\').read().strip()\\n content_b = open(\'b.txt\', \'r\').read().strip()\\n assert content_a == content_b, f\\"content of a.txt: {content_a}, content of b.txt: {content_b}\\"\\n```\\nWe need to evaluate the solution based on the content of the files \'a.txt\' and \'b.txt\'.\\nThe `eval_code` field is used to write the evaluation code. \\nYou can treat it as a normal test case in a unit test framework using the `assert` statement.\\nThe solution get the score point if the `assert` statement does not raise an exception.\\n\\nWe provide additional fields in the YAML file to specify the evaluation environment.\\n\\n```yaml\\nversion: the version of the evaluation file\\nconfig_var: configurations of the agent for this evaluation case\\napp_dir: the working directory of the agent\\ndependencies: list of packages required by the agent\\ndata_files: list of files copied to the working directory\\nmax_rounds: the maximum number of rounds for the conversation\\n```\\n\\nWe have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the `auto_eval/cases` directory.\\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\\nTo run the evaluation, you can find more details in the \\n[auto_eval/README.md](https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md) file.\\n\\n## How to adapt for other agents?\\nAlthough the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\\nas long as the agent can be treated as a conversational partner.\\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\\nas we did for TaskWeaver in `auto_eval/taskweaver_eval.py`:\\n```python\\nclass TaskWeaverVirtualUser(VirtualUser):\\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\\n super().__init__(task_description)\\n\\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\\n self.session = self.app.get_session()\\n self.session_id = self.session.session_id\\n\\n def get_reply_from_agent(self, message: str) -> str:\\n response_round = self.session.send_message(\\n message,\\n event_handler=None,\\n )\\n assert response_round.state != \\"failed\\", \\"Failed to get response from agent.\\"\\n return response_round.post_list[-1].message\\n\\n def close(self):\\n self.app.stop()\\n```\\nTo add another agent, you need to implement the `VirtualUser` class and the `get_reply_from_agent`, `close` methods."},{"id":"/role","metadata":{"permalink":"/TaskWeaver/blog/role","editUrl":"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md","source":"@site/blog/role.md","title":"Roles in TaskWeaver","description":"We frame TaskWeaver as a code-first agent framework. The term \\"code-first\\" means that the agent is designed to","date":"2024-05-15T07:38:43.000Z","formattedDate":"May 15, 2024","tags":[],"readingTime":6.135,"hasTruncateMarker":false,"authors":[],"frontMatter":{},"unlisted":false,"prevItem":{"title":"How to evaluate a LLM agent?","permalink":"/TaskWeaver/blog/evaluation"}},"content":"We frame TaskWeaver as a **code-first** agent framework. The term \\"code-first\\" means that the agent is designed to\\nconvert the user\'s request into one or multiple runnable code snippets and then execute them to generate the response.\\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\\njust like the natural language for human communication. Therefore, TaskWeaver translates the user\'s request in natural language into\\nprogramming languages, which can be executed by the system to perform the desired tasks.\\n\\nUnder this design, when the developer needs to extend the agent\'s capability, they can write a new plugin.\\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\\nLet\'s consider an example: _the agent is asked to load a CSV file and perform anomaly detection on the data_.\\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets.\\n\\n```mermaid\\nflowchart TD\\n A[User] --\\"read a.csv and perform \\n anomaly detection\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"read a.csv and call the \\n anomaly_detection plugin\\n to find anomalies in the data\\"--\x3e C[Code Generator]\\n subgraph Code Interpreter\\n C --\\"df=pd.read_csv(\'a.csv\')\\n anomaly_df=anomaly_detection(df)\\"--\x3e D[Code Executor]\\n end\\n end\\n D --result--\x3e B\\n B --response--\x3e A\\n```\\n\\nHowever, we do find challenges for other tasks that are not naturally represented in code snippets.\\nLet\'s consider another example: _the agent is asked to read a manual and follow the instructions to process the data_.\\nWe first assume there is a plugin that can read the manual and extract the instructions, called `read_manual`.\\nThe workflow of the agent is in the diagram below. \\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior \\nof the agent is less natural compared to the previous example.\\n\\n```mermaid\\nflowchart TD\\n A[User] --\\"read the manual and follow \\n the instructions to process the data\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"step 1: read the manual by \\n calling the read_manual \\n plugin to extract the instructions\\"--\x3e C[Code Generator]\\n subgraph Code Interpreter\\n C --\\"instructions=read_manual()\\n follow_instructions(instructions)\\"--\x3e D[Code Executor]\\n end\\n end\\n D --instructions--\x3e B\\n```\\n\\nWhy? First, there is no need to generate code to read the manual and extract the instructions.\\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\\nEven though that there might be dynamic parts in the code such as some arguments in the function `read_manual`,\\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\\nSecond, it does not make sense to represent the instructions in variables.\\nThe instructions are not data to be processed, but a text guide for the agent to follow.\\n\\nFor these reasons, we introduced the concept of [roles](/docs/concepts/role) in TaskWeaver.\\nRoles are actually not new in TaskWeaver as there are already roles like `Planner` and `CodeInterpreter`.\\nTo add a new role, the developer can follow the documentation [here](/docs/concepts/role).\\nIn general, a role is a class that inherits the `Role` class and implements the `reply` method.\\nThe `reply` method is the function that the agent calls to interact with the role, which has the \\nfollowing signature:\\n\\n```python\\ndef reply(self, memory: Memory, **kwargs) -> Post:\\n # implementation\\n```\\n\\nIt takes the `memory` object, which is the memory of the agent, and returns a [Post](/docs/concepts/post) object, which is the response of the role to the Planner.\\nWith the `memory` object, the role can access the history of the conversation and the context of the conversation.\\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\\nIf a role needs to talk to another role, it should go through the Planner.\\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task \\nas shown in the diagram below.\\n```mermaid\\nflowchart TD\\n A[User] --\\"request\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"step 1\\"--\x3e C[Role 1]\\n C --reply--\x3e B\\n B --\\"step 2\\"--\x3e D[Role 2]\\n D --reply--\x3e B\\n B --\\"step 3\\"--\x3e E[Role 3]\\n E --reply--\x3e B\\n end\\n B --response--\x3e A\\n```\\n\\nThe communication between the Planner and the roles is done through the [Post](/docs/concepts/post) object.\\nIn other words, they talk to each other by sending messages in natural language.\\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it.\\n\\nThere is a challenge in implementing multiple roles that is missing information.\\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\\nWhen the Planner obtains the instructions from a role called `manual_reader`, it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt, \\nit is still possible that the Planner misses some information.\\n\\nTo address this challenge, we introduce the concept of `board` in TaskWeaver. \\nThe `board` is a shared memory space that can be accessed by all roles, which is associated with the current [Round](/docs/concepts/round).\\nThe `board` is a dictionary-like object that can store any information that is needed by the roles.\\nEach role can decide to write or read any information from the `board`.\\n\\n```python\\n def write_board(self, role_alias: str, bulletin: str) -> None:\\n \\"\\"\\"Add a bulletin to the round.\\"\\"\\"\\n self.board[role_alias] = bulletin\\n\\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\\n \\"\\"\\"Read the bulletin of the round.\\"\\"\\"\\n if role_alias is None:\\n return self.board\\n return self.board.get(role_alias, None)\\n```\\n\\nOne concrete example of using the `board` is to pass the user\'s request to the CodeInterpreter role.\\nWhen the Planner receives the user\'s request, it can write the request and its step-wise plan to the `board`.\\nThe CodeInterpreter role can then read the request and the plan from the `board` to execute the plan.\\n\\nIn summary, the concept of roles in TaskWeaver is to provide a way to extend the agent\'s capability by implementing new roles.\\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the `Role` class and implementing the `reply` method.\\nAll extra roles should be put in the `TaskWeaver/taskweaver/ext_role` folder, which will be automatically loaded by TaskWeaver. \\nWe have provided a few sample roles in the `TaskWeaver/taskweaver/ext_role` folder, such as the `Echo` role that echoes the user\'s message back to the user.\\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."}]}')}}]);
\ No newline at end of file
+"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[9522],{8051:e=>{e.exports=JSON.parse('{"blogPosts":[{"id":"/evaluation","metadata":{"permalink":"/TaskWeaver/blog/evaluation","editUrl":"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md","source":"@site/blog/evaluation.md","title":"How to evaluate a LLM agent?","description":"The challenges","date":"2024-05-16T07:27:05.000Z","formattedDate":"May 16, 2024","tags":[],"readingTime":6.29,"hasTruncateMarker":false,"authors":[],"frontMatter":{},"unlisted":false,"nextItem":{"title":"Roles in TaskWeaver","permalink":"/TaskWeaver/blog/role"}},"content":"## The challenges\\nIt is nontrivial to evaluate the performance of a LLM agent. \\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth.\\n\\nThis evaluation method has its limitations due to its rigid nature. \\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs.\\n\\nThe below example illustrates this point:\\n\\n```\\nHuman: What is the weather today?\\nAgent 1: It is sunny today in New York.\\n```\\n\\n```\\nHuman: What is the weather today?\\nAgent 2: Do you want to know the weather in New York today?\\nHuman: Yes.\\nAgent 2: It is sunny today.\\n```\\n\\nCompared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\\nHowever, both agents provide the correct answer to the question.\\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents \\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., \\"sunny\\").\\n\\n## A new evaluation method\\nTherefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\\n![Evaluation](../static/img/evaluation.png)\\nWe introduce two new roles during the evaluation process: the **Examiner** and the **Judge**.\\nFor each test case, the task description is first given to the Examiner.\\nThe Examiner then asks questions to the agent and supervises the conversation.\\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\\nThe Judge will then evaluate the solution based on the ground truth.\\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations.\\n\\nLet\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task \\"Sum of 1 to 50\\".\\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.\\n```yaml\\ntask_description: |-\\n The task has many rounds. The initial total sum is 0. \\n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\\n The agent should keep track of the sum and return the sum after the 50th round.\\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\\nscoring_points:\\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\\n weight: 1\\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\\n weight: 2\\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\\n weight: 3\\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\\n weight: 4\\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\\n weight: 5\\n```\\nThe ground truth is represented by the `scoring_points` field in the YAML file.\\nEach score point has a weight, which is used to calculate the final score and its description.\\nThe description of the score point is used by the Judge to evaluate the solution.\\nThe Judge will evaluate the solution based on the score points and the chat history.\\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\\nTherefore, the normalized score is between 0 and 1.\\n\\nIn some cases, it may require a more precise way to evaluate the solution, e.g., with code.\\nThis following task description is an example of such a case.\\n```yaml\\ntask_description: |- \\n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \\n 1. generate 1 random integer number and save it to a file named \'a.txt\', just tell me if the task is done\\n 2. tell me a random joke\\n 3. save the previously generated random number to a file named \'b.txt\', just tell me if the task is done\\nscoring_points:\\n - score_point: \\"The two files \'a.txt\' and \'b.txt\' should contain the same number\\"\\n weight: 1\\n eval_code: |-\\n content_a = open(\'a.txt\', \'r\').read().strip()\\n content_b = open(\'b.txt\', \'r\').read().strip()\\n assert content_a == content_b, f\\"content of a.txt: {content_a}, content of b.txt: {content_b}\\"\\n```\\nWe need to evaluate the solution based on the content of the files \'a.txt\' and \'b.txt\'.\\nThe `eval_code` field is used to write the evaluation code. \\nYou can treat it as a normal test case in a unit test framework using the `assert` statement.\\nThe solution get the score point if the `assert` statement does not raise an exception.\\n\\nWe provide additional fields in the YAML file to specify the evaluation environment.\\n\\n```yaml\\nversion: the version of the evaluation file\\nconfig_var: configurations of the agent for this evaluation case\\napp_dir: the working directory of the agent\\ndependencies: list of packages required by the agent\\ndata_files: list of files copied to the working directory\\nmax_rounds: the maximum number of rounds for the conversation\\n```\\n\\nWe have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the `auto_eval/cases` directory.\\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\\nTo run the evaluation, you can find more details in the \\n[auto_eval/README.md](https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md) file.\\n\\n## How to adapt for other agents?\\nAlthough the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\\nas long as the agent can be treated as a conversational partner.\\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\\nas we did for TaskWeaver in `auto_eval/taskweaver_eval.py`:\\n```python\\nclass TaskWeaverVirtualUser(VirtualUser):\\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\\n super().__init__(task_description)\\n\\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\\n self.session = self.app.get_session()\\n self.session_id = self.session.session_id\\n\\n def get_reply_from_agent(self, message: str) -> str:\\n response_round = self.session.send_message(\\n message,\\n event_handler=None,\\n )\\n assert response_round.state != \\"failed\\", \\"Failed to get response from agent.\\"\\n return response_round.post_list[-1].message\\n\\n def close(self):\\n self.app.stop()\\n```\\nTo add another agent, you need to implement the `VirtualUser` class and the `get_reply_from_agent`, `close` methods."},{"id":"/role","metadata":{"permalink":"/TaskWeaver/blog/role","editUrl":"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md","source":"@site/blog/role.md","title":"Roles in TaskWeaver","description":"We frame TaskWeaver as a code-first agent framework. The term \\"code-first\\" means that the agent is designed to","date":"2024-05-16T07:27:05.000Z","formattedDate":"May 16, 2024","tags":[],"readingTime":6.135,"hasTruncateMarker":false,"authors":[],"frontMatter":{},"unlisted":false,"prevItem":{"title":"How to evaluate a LLM agent?","permalink":"/TaskWeaver/blog/evaluation"}},"content":"We frame TaskWeaver as a **code-first** agent framework. The term \\"code-first\\" means that the agent is designed to\\nconvert the user\'s request into one or multiple runnable code snippets and then execute them to generate the response.\\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\\njust like the natural language for human communication. Therefore, TaskWeaver translates the user\'s request in natural language into\\nprogramming languages, which can be executed by the system to perform the desired tasks.\\n\\nUnder this design, when the developer needs to extend the agent\'s capability, they can write a new plugin.\\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\\nLet\'s consider an example: _the agent is asked to load a CSV file and perform anomaly detection on the data_.\\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets.\\n\\n```mermaid\\nflowchart TD\\n A[User] --\\"read a.csv and perform \\n anomaly detection\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"read a.csv and call the \\n anomaly_detection plugin\\n to find anomalies in the data\\"--\x3e C[Code Generator]\\n subgraph Code Interpreter\\n C --\\"df=pd.read_csv(\'a.csv\')\\n anomaly_df=anomaly_detection(df)\\"--\x3e D[Code Executor]\\n end\\n end\\n D --result--\x3e B\\n B --response--\x3e A\\n```\\n\\nHowever, we do find challenges for other tasks that are not naturally represented in code snippets.\\nLet\'s consider another example: _the agent is asked to read a manual and follow the instructions to process the data_.\\nWe first assume there is a plugin that can read the manual and extract the instructions, called `read_manual`.\\nThe workflow of the agent is in the diagram below. \\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior \\nof the agent is less natural compared to the previous example.\\n\\n```mermaid\\nflowchart TD\\n A[User] --\\"read the manual and follow \\n the instructions to process the data\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"step 1: read the manual by \\n calling the read_manual \\n plugin to extract the instructions\\"--\x3e C[Code Generator]\\n subgraph Code Interpreter\\n C --\\"instructions=read_manual()\\n follow_instructions(instructions)\\"--\x3e D[Code Executor]\\n end\\n end\\n D --instructions--\x3e B\\n```\\n\\nWhy? First, there is no need to generate code to read the manual and extract the instructions.\\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\\nEven though that there might be dynamic parts in the code such as some arguments in the function `read_manual`,\\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\\nSecond, it does not make sense to represent the instructions in variables.\\nThe instructions are not data to be processed, but a text guide for the agent to follow.\\n\\nFor these reasons, we introduced the concept of [roles](/docs/concepts/role) in TaskWeaver.\\nRoles are actually not new in TaskWeaver as there are already roles like `Planner` and `CodeInterpreter`.\\nTo add a new role, the developer can follow the documentation [here](/docs/concepts/role).\\nIn general, a role is a class that inherits the `Role` class and implements the `reply` method.\\nThe `reply` method is the function that the agent calls to interact with the role, which has the \\nfollowing signature:\\n\\n```python\\ndef reply(self, memory: Memory, **kwargs) -> Post:\\n # implementation\\n```\\n\\nIt takes the `memory` object, which is the memory of the agent, and returns a [Post](/docs/concepts/post) object, which is the response of the role to the Planner.\\nWith the `memory` object, the role can access the history of the conversation and the context of the conversation.\\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\\nIf a role needs to talk to another role, it should go through the Planner.\\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task \\nas shown in the diagram below.\\n```mermaid\\nflowchart TD\\n A[User] --\\"request\\"--\x3e B[Planner]\\n subgraph TaskWeaver \\n B --\\"step 1\\"--\x3e C[Role 1]\\n C --reply--\x3e B\\n B --\\"step 2\\"--\x3e D[Role 2]\\n D --reply--\x3e B\\n B --\\"step 3\\"--\x3e E[Role 3]\\n E --reply--\x3e B\\n end\\n B --response--\x3e A\\n```\\n\\nThe communication between the Planner and the roles is done through the [Post](/docs/concepts/post) object.\\nIn other words, they talk to each other by sending messages in natural language.\\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it.\\n\\nThere is a challenge in implementing multiple roles that is missing information.\\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\\nWhen the Planner obtains the instructions from a role called `manual_reader`, it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt, \\nit is still possible that the Planner misses some information.\\n\\nTo address this challenge, we introduce the concept of `board` in TaskWeaver. \\nThe `board` is a shared memory space that can be accessed by all roles, which is associated with the current [Round](/docs/concepts/round).\\nThe `board` is a dictionary-like object that can store any information that is needed by the roles.\\nEach role can decide to write or read any information from the `board`.\\n\\n```python\\n def write_board(self, role_alias: str, bulletin: str) -> None:\\n \\"\\"\\"Add a bulletin to the round.\\"\\"\\"\\n self.board[role_alias] = bulletin\\n\\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\\n \\"\\"\\"Read the bulletin of the round.\\"\\"\\"\\n if role_alias is None:\\n return self.board\\n return self.board.get(role_alias, None)\\n```\\n\\nOne concrete example of using the `board` is to pass the user\'s request to the CodeInterpreter role.\\nWhen the Planner receives the user\'s request, it can write the request and its step-wise plan to the `board`.\\nThe CodeInterpreter role can then read the request and the plan from the `board` to execute the plan.\\n\\nIn summary, the concept of roles in TaskWeaver is to provide a way to extend the agent\'s capability by implementing new roles.\\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the `Role` class and implementing the `reply` method.\\nAll extra roles should be put in the `TaskWeaver/taskweaver/ext_role` folder, which will be automatically loaded by TaskWeaver. \\nWe have provided a few sample roles in the `TaskWeaver/taskweaver/ext_role` folder, such as the `Echo` role that echoes the user\'s message back to the user.\\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."}]}')}}]);
\ No newline at end of file
diff --git a/assets/js/a27d32e8.42aafddf.js b/assets/js/a27d32e8.544cc49c.js
similarity index 99%
rename from assets/js/a27d32e8.42aafddf.js
rename to assets/js/a27d32e8.544cc49c.js
index 2b51ab0f..87522aa7 100644
--- a/assets/js/a27d32e8.42aafddf.js
+++ b/assets/js/a27d32e8.544cc49c.js
@@ -1 +1 @@
-"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[4904],{6893:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>l,contentTitle:()=>s,default:()=>c,frontMatter:()=>o,metadata:()=>i,toc:()=>h});var a=n(5893),r=n(1151);const o={},s="Roles in TaskWeaver",i={permalink:"/TaskWeaver/blog/role",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md",source:"@site/blog/role.md",title:"Roles in TaskWeaver",description:'We frame TaskWeaver as a code-first agent framework. The term "code-first" means that the agent is designed to',date:"2024-05-15T07:38:43.000Z",formattedDate:"May 15, 2024",tags:[],readingTime:6.135,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,prevItem:{title:"How to evaluate a LLM agent?",permalink:"/TaskWeaver/blog/evaluation"}},l={authorsImageUrls:[]},h=[];function d(e){const t={a:"a",code:"code",em:"em",mermaid:"mermaid",p:"p",pre:"pre",strong:"strong",...(0,r.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsxs)(t.p,{children:["We frame TaskWeaver as a ",(0,a.jsx)(t.strong,{children:"code-first"})," agent framework. The term \"code-first\" means that the agent is designed to\nconvert the user's request into one or multiple runnable code snippets and then execute them to generate the response.\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\njust like the natural language for human communication. Therefore, TaskWeaver translates the user's request in natural language into\nprogramming languages, which can be executed by the system to perform the desired tasks."]}),"\n",(0,a.jsxs)(t.p,{children:["Under this design, when the developer needs to extend the agent's capability, they can write a new plugin.\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\nLet's consider an example: ",(0,a.jsx)(t.em,{children:"the agent is asked to load a CSV file and perform anomaly detection on the data"}),".\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read a.csv and perform \n anomaly detection"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"read a.csv and call the \n anomaly_detection plugin\n to find anomalies in the data"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"df=pd.read_csv(\'a.csv\')\n anomaly_df=anomaly_detection(df)"--\x3e D[Code Executor]\n end\n end\n D --result--\x3e B\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["However, we do find challenges for other tasks that are not naturally represented in code snippets.\nLet's consider another example: ",(0,a.jsx)(t.em,{children:"the agent is asked to read a manual and follow the instructions to process the data"}),".\nWe first assume there is a plugin that can read the manual and extract the instructions, called ",(0,a.jsx)(t.code,{children:"read_manual"}),".\nThe workflow of the agent is in the diagram below.\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior\nof the agent is less natural compared to the previous example."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read the manual and follow \n the instructions to process the data"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1: read the manual by \n calling the read_manual \n plugin to extract the instructions"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"instructions=read_manual()\n follow_instructions(instructions)"--\x3e D[Code Executor]\n end\n end\n D --instructions--\x3e B'}),"\n",(0,a.jsxs)(t.p,{children:["Why? First, there is no need to generate code to read the manual and extract the instructions.\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\nEven though that there might be dynamic parts in the code such as some arguments in the function ",(0,a.jsx)(t.code,{children:"read_manual"}),",\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\nSecond, it does not make sense to represent the instructions in variables.\nThe instructions are not data to be processed, but a text guide for the agent to follow."]}),"\n",(0,a.jsxs)(t.p,{children:["For these reasons, we introduced the concept of ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"roles"})," in TaskWeaver.\nRoles are actually not new in TaskWeaver as there are already roles like ",(0,a.jsx)(t.code,{children:"Planner"})," and ",(0,a.jsx)(t.code,{children:"CodeInterpreter"}),".\nTo add a new role, the developer can follow the documentation ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"here"}),".\nIn general, a role is a class that inherits the ",(0,a.jsx)(t.code,{children:"Role"})," class and implements the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nThe ",(0,a.jsx)(t.code,{children:"reply"})," method is the function that the agent calls to interact with the role, which has the\nfollowing signature:"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:"def reply(self, memory: Memory, **kwargs) -> Post:\n # implementation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["It takes the ",(0,a.jsx)(t.code,{children:"memory"})," object, which is the memory of the agent, and returns a ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object, which is the response of the role to the Planner.\nWith the ",(0,a.jsx)(t.code,{children:"memory"})," object, the role can access the history of the conversation and the context of the conversation.\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\nIf a role needs to talk to another role, it should go through the Planner.\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task\nas shown in the diagram below."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"request"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1"--\x3e C[Role 1]\n C --reply--\x3e B\n B --"step 2"--\x3e D[Role 2]\n D --reply--\x3e B\n B --"step 3"--\x3e E[Role 3]\n E --reply--\x3e B\n end\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["The communication between the Planner and the roles is done through the ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object.\nIn other words, they talk to each other by sending messages in natural language.\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it."]}),"\n",(0,a.jsxs)(t.p,{children:["There is a challenge in implementing multiple roles that is missing information.\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\nWhen the Planner obtains the instructions from a role called ",(0,a.jsx)(t.code,{children:"manual_reader"}),", it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt,\nit is still possible that the Planner misses some information."]}),"\n",(0,a.jsxs)(t.p,{children:["To address this challenge, we introduce the concept of ",(0,a.jsx)(t.code,{children:"board"})," in TaskWeaver.\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a shared memory space that can be accessed by all roles, which is associated with the current ",(0,a.jsx)(t.a,{href:"/docs/concepts/round",children:"Round"}),".\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a dictionary-like object that can store any information that is needed by the roles.\nEach role can decide to write or read any information from the ",(0,a.jsx)(t.code,{children:"board"}),"."]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:' def write_board(self, role_alias: str, bulletin: str) -> None:\n """Add a bulletin to the round."""\n self.board[role_alias] = bulletin\n\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\n """Read the bulletin of the round."""\n if role_alias is None:\n return self.board\n return self.board.get(role_alias, None)\n'})}),"\n",(0,a.jsxs)(t.p,{children:["One concrete example of using the ",(0,a.jsx)(t.code,{children:"board"})," is to pass the user's request to the CodeInterpreter role.\nWhen the Planner receives the user's request, it can write the request and its step-wise plan to the ",(0,a.jsx)(t.code,{children:"board"}),".\nThe CodeInterpreter role can then read the request and the plan from the ",(0,a.jsx)(t.code,{children:"board"})," to execute the plan."]}),"\n",(0,a.jsxs)(t.p,{children:["In summary, the concept of roles in TaskWeaver is to provide a way to extend the agent's capability by implementing new roles.\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the ",(0,a.jsx)(t.code,{children:"Role"})," class and implementing the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nAll extra roles should be put in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, which will be automatically loaded by TaskWeaver.\nWe have provided a few sample roles in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, such as the ",(0,a.jsx)(t.code,{children:"Echo"})," role that echoes the user's message back to the user.\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."]})]})}function c(e={}){const{wrapper:t}={...(0,r.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(d,{...e})}):d(e)}},1151:(e,t,n)=>{n.d(t,{Z:()=>i,a:()=>s});var a=n(7294);const r={},o=a.createContext(r);function s(e){const t=a.useContext(o);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function i(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:s(e.components),a.createElement(o.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
+"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[4904],{6893:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>l,contentTitle:()=>s,default:()=>c,frontMatter:()=>o,metadata:()=>i,toc:()=>h});var a=n(5893),r=n(1151);const o={},s="Roles in TaskWeaver",i={permalink:"/TaskWeaver/blog/role",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md",source:"@site/blog/role.md",title:"Roles in TaskWeaver",description:'We frame TaskWeaver as a code-first agent framework. The term "code-first" means that the agent is designed to',date:"2024-05-16T07:27:05.000Z",formattedDate:"May 16, 2024",tags:[],readingTime:6.135,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,prevItem:{title:"How to evaluate a LLM agent?",permalink:"/TaskWeaver/blog/evaluation"}},l={authorsImageUrls:[]},h=[];function d(e){const t={a:"a",code:"code",em:"em",mermaid:"mermaid",p:"p",pre:"pre",strong:"strong",...(0,r.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsxs)(t.p,{children:["We frame TaskWeaver as a ",(0,a.jsx)(t.strong,{children:"code-first"})," agent framework. The term \"code-first\" means that the agent is designed to\nconvert the user's request into one or multiple runnable code snippets and then execute them to generate the response.\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\njust like the natural language for human communication. Therefore, TaskWeaver translates the user's request in natural language into\nprogramming languages, which can be executed by the system to perform the desired tasks."]}),"\n",(0,a.jsxs)(t.p,{children:["Under this design, when the developer needs to extend the agent's capability, they can write a new plugin.\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\nLet's consider an example: ",(0,a.jsx)(t.em,{children:"the agent is asked to load a CSV file and perform anomaly detection on the data"}),".\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read a.csv and perform \n anomaly detection"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"read a.csv and call the \n anomaly_detection plugin\n to find anomalies in the data"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"df=pd.read_csv(\'a.csv\')\n anomaly_df=anomaly_detection(df)"--\x3e D[Code Executor]\n end\n end\n D --result--\x3e B\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["However, we do find challenges for other tasks that are not naturally represented in code snippets.\nLet's consider another example: ",(0,a.jsx)(t.em,{children:"the agent is asked to read a manual and follow the instructions to process the data"}),".\nWe first assume there is a plugin that can read the manual and extract the instructions, called ",(0,a.jsx)(t.code,{children:"read_manual"}),".\nThe workflow of the agent is in the diagram below.\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior\nof the agent is less natural compared to the previous example."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read the manual and follow \n the instructions to process the data"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1: read the manual by \n calling the read_manual \n plugin to extract the instructions"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"instructions=read_manual()\n follow_instructions(instructions)"--\x3e D[Code Executor]\n end\n end\n D --instructions--\x3e B'}),"\n",(0,a.jsxs)(t.p,{children:["Why? First, there is no need to generate code to read the manual and extract the instructions.\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\nEven though that there might be dynamic parts in the code such as some arguments in the function ",(0,a.jsx)(t.code,{children:"read_manual"}),",\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\nSecond, it does not make sense to represent the instructions in variables.\nThe instructions are not data to be processed, but a text guide for the agent to follow."]}),"\n",(0,a.jsxs)(t.p,{children:["For these reasons, we introduced the concept of ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"roles"})," in TaskWeaver.\nRoles are actually not new in TaskWeaver as there are already roles like ",(0,a.jsx)(t.code,{children:"Planner"})," and ",(0,a.jsx)(t.code,{children:"CodeInterpreter"}),".\nTo add a new role, the developer can follow the documentation ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"here"}),".\nIn general, a role is a class that inherits the ",(0,a.jsx)(t.code,{children:"Role"})," class and implements the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nThe ",(0,a.jsx)(t.code,{children:"reply"})," method is the function that the agent calls to interact with the role, which has the\nfollowing signature:"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:"def reply(self, memory: Memory, **kwargs) -> Post:\n # implementation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["It takes the ",(0,a.jsx)(t.code,{children:"memory"})," object, which is the memory of the agent, and returns a ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object, which is the response of the role to the Planner.\nWith the ",(0,a.jsx)(t.code,{children:"memory"})," object, the role can access the history of the conversation and the context of the conversation.\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\nIf a role needs to talk to another role, it should go through the Planner.\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task\nas shown in the diagram below."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"request"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1"--\x3e C[Role 1]\n C --reply--\x3e B\n B --"step 2"--\x3e D[Role 2]\n D --reply--\x3e B\n B --"step 3"--\x3e E[Role 3]\n E --reply--\x3e B\n end\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["The communication between the Planner and the roles is done through the ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object.\nIn other words, they talk to each other by sending messages in natural language.\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it."]}),"\n",(0,a.jsxs)(t.p,{children:["There is a challenge in implementing multiple roles that is missing information.\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\nWhen the Planner obtains the instructions from a role called ",(0,a.jsx)(t.code,{children:"manual_reader"}),", it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt,\nit is still possible that the Planner misses some information."]}),"\n",(0,a.jsxs)(t.p,{children:["To address this challenge, we introduce the concept of ",(0,a.jsx)(t.code,{children:"board"})," in TaskWeaver.\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a shared memory space that can be accessed by all roles, which is associated with the current ",(0,a.jsx)(t.a,{href:"/docs/concepts/round",children:"Round"}),".\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a dictionary-like object that can store any information that is needed by the roles.\nEach role can decide to write or read any information from the ",(0,a.jsx)(t.code,{children:"board"}),"."]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:' def write_board(self, role_alias: str, bulletin: str) -> None:\n """Add a bulletin to the round."""\n self.board[role_alias] = bulletin\n\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\n """Read the bulletin of the round."""\n if role_alias is None:\n return self.board\n return self.board.get(role_alias, None)\n'})}),"\n",(0,a.jsxs)(t.p,{children:["One concrete example of using the ",(0,a.jsx)(t.code,{children:"board"})," is to pass the user's request to the CodeInterpreter role.\nWhen the Planner receives the user's request, it can write the request and its step-wise plan to the ",(0,a.jsx)(t.code,{children:"board"}),".\nThe CodeInterpreter role can then read the request and the plan from the ",(0,a.jsx)(t.code,{children:"board"})," to execute the plan."]}),"\n",(0,a.jsxs)(t.p,{children:["In summary, the concept of roles in TaskWeaver is to provide a way to extend the agent's capability by implementing new roles.\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the ",(0,a.jsx)(t.code,{children:"Role"})," class and implementing the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nAll extra roles should be put in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, which will be automatically loaded by TaskWeaver.\nWe have provided a few sample roles in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, such as the ",(0,a.jsx)(t.code,{children:"Echo"})," role that echoes the user's message back to the user.\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."]})]})}function c(e={}){const{wrapper:t}={...(0,r.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(d,{...e})}):d(e)}},1151:(e,t,n)=>{n.d(t,{Z:()=>i,a:()=>s});var a=n(7294);const r={},o=a.createContext(r);function s(e){const t=a.useContext(o);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function i(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:s(e.components),a.createElement(o.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
diff --git a/assets/js/c39bf4d4.f1fc4dc0.js b/assets/js/c39bf4d4.59fa2419.js
similarity index 99%
rename from assets/js/c39bf4d4.f1fc4dc0.js
rename to assets/js/c39bf4d4.59fa2419.js
index 79e67031..0c17f34f 100644
--- a/assets/js/c39bf4d4.f1fc4dc0.js
+++ b/assets/js/c39bf4d4.59fa2419.js
@@ -1 +1 @@
-"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[1899],{7832:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>h,contentTitle:()=>i,default:()=>c,frontMatter:()=>s,metadata:()=>r,toc:()=>d});var a=n(5893),o=n(1151);const s={},i="How to evaluate a LLM agent?",r={permalink:"/TaskWeaver/blog/evaluation",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md",source:"@site/blog/evaluation.md",title:"How to evaluate a LLM agent?",description:"The challenges",date:"2024-05-15T07:38:43.000Z",formattedDate:"May 15, 2024",tags:[],readingTime:6.29,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,nextItem:{title:"Roles in TaskWeaver",permalink:"/TaskWeaver/blog/role"}},h={authorsImageUrls:[]},d=[{value:"The challenges",id:"the-challenges",level:2},{value:"A new evaluation method",id:"a-new-evaluation-method",level:2},{value:"How to adapt for other agents?",id:"how-to-adapt-for-other-agents",level:2}];function l(e){const t={a:"a",code:"code",h2:"h2",img:"img",p:"p",pre:"pre",strong:"strong",...(0,o.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(t.h2,{id:"the-challenges",children:"The challenges"}),"\n",(0,a.jsx)(t.p,{children:"It is nontrivial to evaluate the performance of a LLM agent.\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth."}),"\n",(0,a.jsx)(t.p,{children:"This evaluation method has its limitations due to its rigid nature.\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs."}),"\n",(0,a.jsx)(t.p,{children:"The below example illustrates this point:"}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 1: It is sunny today in New York.\n"})}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 2: Do you want to know the weather in New York today?\nHuman: Yes.\nAgent 2: It is sunny today.\n"})}),"\n",(0,a.jsx)(t.p,{children:'Compared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\nHowever, both agents provide the correct answer to the question.\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., "sunny").'}),"\n",(0,a.jsx)(t.h2,{id:"a-new-evaluation-method",children:"A new evaluation method"}),"\n",(0,a.jsxs)(t.p,{children:["Therefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\n",(0,a.jsx)(t.img,{alt:"Evaluation",src:n(100).Z+"",width:"965",height:"659"}),"\nWe introduce two new roles during the evaluation process: the ",(0,a.jsx)(t.strong,{children:"Examiner"})," and the ",(0,a.jsx)(t.strong,{children:"Judge"}),".\nFor each test case, the task description is first given to the Examiner.\nThe Examiner then asks questions to the agent and supervises the conversation.\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\nThe Judge will then evaluate the solution based on the ground truth.\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations."]}),"\n",(0,a.jsx)(t.p,{children:'Let\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task "Sum of 1 to 50".\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.'}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |-\n The task has many rounds. The initial total sum is 0. \n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\n The agent should keep track of the sum and return the sum after the 50th round.\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\nscoring_points:\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\n weight: 1\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\n weight: 2\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\n weight: 3\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\n weight: 4\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\n weight: 5\n"})}),"\n",(0,a.jsxs)(t.p,{children:["The ground truth is represented by the ",(0,a.jsx)(t.code,{children:"scoring_points"})," field in the YAML file.\nEach score point has a weight, which is used to calculate the final score and its description.\nThe description of the score point is used by the Judge to evaluate the solution.\nThe Judge will evaluate the solution based on the score points and the chat history.\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\nTherefore, the normalized score is between 0 and 1."]}),"\n",(0,a.jsx)(t.p,{children:"In some cases, it may require a more precise way to evaluate the solution, e.g., with code.\nThis following task description is an example of such a case."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |- \n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \n 1. generate 1 random integer number and save it to a file named 'a.txt', just tell me if the task is done\n 2. tell me a random joke\n 3. save the previously generated random number to a file named 'b.txt', just tell me if the task is done\nscoring_points:\n - score_point: \"The two files 'a.txt' and 'b.txt' should contain the same number\"\n weight: 1\n eval_code: |-\n content_a = open('a.txt', 'r').read().strip()\n content_b = open('b.txt', 'r').read().strip()\n assert content_a == content_b, f\"content of a.txt: {content_a}, content of b.txt: {content_b}\"\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We need to evaluate the solution based on the content of the files 'a.txt' and 'b.txt'.\nThe ",(0,a.jsx)(t.code,{children:"eval_code"})," field is used to write the evaluation code.\nYou can treat it as a normal test case in a unit test framework using the ",(0,a.jsx)(t.code,{children:"assert"})," statement.\nThe solution get the score point if the ",(0,a.jsx)(t.code,{children:"assert"})," statement does not raise an exception."]}),"\n",(0,a.jsx)(t.p,{children:"We provide additional fields in the YAML file to specify the evaluation environment."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"version: the version of the evaluation file\nconfig_var: configurations of the agent for this evaluation case\napp_dir: the working directory of the agent\ndependencies: list of packages required by the agent\ndata_files: list of files copied to the working directory\nmax_rounds: the maximum number of rounds for the conversation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the ",(0,a.jsx)(t.code,{children:"auto_eval/cases"})," directory.\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\nTo run the evaluation, you can find more details in the\n",(0,a.jsx)(t.a,{href:"https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md",children:"auto_eval/README.md"})," file."]}),"\n",(0,a.jsx)(t.h2,{id:"how-to-adapt-for-other-agents",children:"How to adapt for other agents?"}),"\n",(0,a.jsxs)(t.p,{children:["Although the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\nas long as the agent can be treated as a conversational partner.\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\nas we did for TaskWeaver in ",(0,a.jsx)(t.code,{children:"auto_eval/taskweaver_eval.py"}),":"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:'class TaskWeaverVirtualUser(VirtualUser):\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\n super().__init__(task_description)\n\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\n self.session = self.app.get_session()\n self.session_id = self.session.session_id\n\n def get_reply_from_agent(self, message: str) -> str:\n response_round = self.session.send_message(\n message,\n event_handler=None,\n )\n assert response_round.state != "failed", "Failed to get response from agent."\n return response_round.post_list[-1].message\n\n def close(self):\n self.app.stop()\n'})}),"\n",(0,a.jsxs)(t.p,{children:["To add another agent, you need to implement the ",(0,a.jsx)(t.code,{children:"VirtualUser"})," class and the ",(0,a.jsx)(t.code,{children:"get_reply_from_agent"}),", ",(0,a.jsx)(t.code,{children:"close"})," methods."]})]})}function c(e={}){const{wrapper:t}={...(0,o.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(l,{...e})}):l(e)}},100:(e,t,n)=>{n.d(t,{Z:()=>a});const a=n.p+"assets/images/evaluation-ac91a46e949f383154a9ffbafcfbc981.png"},1151:(e,t,n)=>{n.d(t,{Z:()=>r,a:()=>i});var a=n(7294);const o={},s=a.createContext(o);function i(e){const t=a.useContext(s);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function r(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(o):e.components||o:i(e.components),a.createElement(s.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
+"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[1899],{7832:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>h,contentTitle:()=>i,default:()=>c,frontMatter:()=>s,metadata:()=>r,toc:()=>d});var a=n(5893),o=n(1151);const s={},i="How to evaluate a LLM agent?",r={permalink:"/TaskWeaver/blog/evaluation",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/evaluation.md",source:"@site/blog/evaluation.md",title:"How to evaluate a LLM agent?",description:"The challenges",date:"2024-05-16T07:27:05.000Z",formattedDate:"May 16, 2024",tags:[],readingTime:6.29,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,nextItem:{title:"Roles in TaskWeaver",permalink:"/TaskWeaver/blog/role"}},h={authorsImageUrls:[]},d=[{value:"The challenges",id:"the-challenges",level:2},{value:"A new evaluation method",id:"a-new-evaluation-method",level:2},{value:"How to adapt for other agents?",id:"how-to-adapt-for-other-agents",level:2}];function l(e){const t={a:"a",code:"code",h2:"h2",img:"img",p:"p",pre:"pre",strong:"strong",...(0,o.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsx)(t.h2,{id:"the-challenges",children:"The challenges"}),"\n",(0,a.jsx)(t.p,{children:"It is nontrivial to evaluate the performance of a LLM agent.\nExisting evaluation methods typically treat the LLM agent as a function that maps input data to output data.\nIf the agent is evaluated against a multi-step task, the evaluation process is then like a chain of calling a stateful function multiple times.\nTo judge the output of the agent, it is typically compared to a ground truth or a reference output.\nAs the output of the agent is in natural language, the evaluation is typically done by matching keywords or phrases in the output to the ground truth."}),"\n",(0,a.jsx)(t.p,{children:"This evaluation method has its limitations due to its rigid nature.\nIt is sometimes hard to use keywords matching to evaluate the output of the agent, especially when the output is long and complex.\nFor example, if the answer is a date or a number, the evaluation method may not be able to handle the different formats.\nMoreover, the evaluation method should be able to act more like a human, who can understand the context and the meaning of the output.\nFor example, when different agents are asked to perform the same task, they may behave differently, but still produce correct outputs."}),"\n",(0,a.jsx)(t.p,{children:"The below example illustrates this point:"}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 1: It is sunny today in New York.\n"})}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{children:"Human: What is the weather today?\nAgent 2: Do you want to know the weather in New York today?\nHuman: Yes.\nAgent 2: It is sunny today.\n"})}),"\n",(0,a.jsx)(t.p,{children:'Compared to Agent 1, Agent 2 asks for confirmation before providing the answer, which requires more interaction with the user.\nHowever, both agents provide the correct answer to the question.\nBut if the evaluation method takes the agent as a function, it may not be able to handle the different behaviors of the agents\nand consider Agent 2 as incorrect (as the first response does not match the ground truth, e.g., "sunny").'}),"\n",(0,a.jsx)(t.h2,{id:"a-new-evaluation-method",children:"A new evaluation method"}),"\n",(0,a.jsxs)(t.p,{children:["Therefore, we propose a new evaluation method that treats the agent as a conversational partner as shown in the figure below:\n",(0,a.jsx)(t.img,{alt:"Evaluation",src:n(100).Z+"",width:"965",height:"659"}),"\nWe introduce two new roles during the evaluation process: the ",(0,a.jsx)(t.strong,{children:"Examiner"})," and the ",(0,a.jsx)(t.strong,{children:"Judge"}),".\nFor each test case, the task description is first given to the Examiner.\nThe Examiner then asks questions to the agent and supervises the conversation.\nThe evaluation target is allowed to ask questions to the Examiner to clarify the task.\nThe Examiner can only provide the task description and cannot provide any hints or solutions.\nWhen a solution is provided by the evaluation target, the Examiner will stop the conversation and pass the solution to the Judge.\nThe Judge will then evaluate the solution based on the ground truth.\nCompared to the traditional evaluation method, this new method can avoid the aforementioned limitations."]}),"\n",(0,a.jsx)(t.p,{children:'Let\'s see an example of how the new evaluation method works. The following YAML file is a task description for the task "Sum of 1 to 50".\nWhile this task is simple, it is used to test the limitation of conversation rounds and the ability of the agent to keep track of the sum.\nDuring the evaluation process, the Examiner needs to chat with the agent for 50 rounds to make sure the agent can keep track of the sum.\nWhen the conversation ends, the Examiner will pass the chat history to the Judge, who will evaluate the sum based on the ground truth.'}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |-\n The task has many rounds. The initial total sum is 0. \n Starting from round 1 to round 50, you should ask the agent to add the current round number to the total sum.\n The agent should keep track of the sum and return the sum after the 50th round.\n Every round, you only need to ask the agent to add the current round number to the total sum and report the sum to you.\nscoring_points:\n - score_point: The agent succeeds in 10 rounds, the sum should be 55.\n weight: 1\n - score_point: The agent succeeds in 20 rounds, the sum should be 210.\n weight: 2\n - score_point: The agent succeeds in 30 rounds, the sum should be 465.\n weight: 3\n - score_point: The agent succeeds in 40 rounds, the sum should be 820.\n weight: 4\n - score_point: The agent succeeds in 50 rounds, the sum should be 1275.\n weight: 5\n"})}),"\n",(0,a.jsxs)(t.p,{children:["The ground truth is represented by the ",(0,a.jsx)(t.code,{children:"scoring_points"})," field in the YAML file.\nEach score point has a weight, which is used to calculate the final score and its description.\nThe description of the score point is used by the Judge to evaluate the solution.\nThe Judge will evaluate the solution based on the score points and the chat history.\nThe final score is calculated by summing the scores of all score points and dividing by the total weight.\nTherefore, the normalized score is between 0 and 1."]}),"\n",(0,a.jsx)(t.p,{children:"In some cases, it may require a more precise way to evaluate the solution, e.g., with code.\nThis following task description is an example of such a case."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"task_description: |- \n The task is to send 3 requests one-by-one and get the agent responses, no need to check the response content: \n 1. generate 1 random integer number and save it to a file named 'a.txt', just tell me if the task is done\n 2. tell me a random joke\n 3. save the previously generated random number to a file named 'b.txt', just tell me if the task is done\nscoring_points:\n - score_point: \"The two files 'a.txt' and 'b.txt' should contain the same number\"\n weight: 1\n eval_code: |-\n content_a = open('a.txt', 'r').read().strip()\n content_b = open('b.txt', 'r').read().strip()\n assert content_a == content_b, f\"content of a.txt: {content_a}, content of b.txt: {content_b}\"\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We need to evaluate the solution based on the content of the files 'a.txt' and 'b.txt'.\nThe ",(0,a.jsx)(t.code,{children:"eval_code"})," field is used to write the evaluation code.\nYou can treat it as a normal test case in a unit test framework using the ",(0,a.jsx)(t.code,{children:"assert"})," statement.\nThe solution get the score point if the ",(0,a.jsx)(t.code,{children:"assert"})," statement does not raise an exception."]}),"\n",(0,a.jsx)(t.p,{children:"We provide additional fields in the YAML file to specify the evaluation environment."}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-yaml",children:"version: the version of the evaluation file\nconfig_var: configurations of the agent for this evaluation case\napp_dir: the working directory of the agent\ndependencies: list of packages required by the agent\ndata_files: list of files copied to the working directory\nmax_rounds: the maximum number of rounds for the conversation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["We have implemented the new evaluation method in TaskWeaver and prepared a set of evaluation cases in the ",(0,a.jsx)(t.code,{children:"auto_eval/cases"})," directory.\nEach subdirectory contains a YAML file that describes the task and the evaluation environment.\nTo run the evaluation, you can find more details in the\n",(0,a.jsx)(t.a,{href:"https://github.com/microsoft/TaskWeaver/blob/main/auto_eval/README.md",children:"auto_eval/README.md"})," file."]}),"\n",(0,a.jsx)(t.h2,{id:"how-to-adapt-for-other-agents",children:"How to adapt for other agents?"}),"\n",(0,a.jsxs)(t.p,{children:["Although the new evaluation method is designed for TaskWeaver, it can be applied to other agents as well,\nas long as the agent can be treated as a conversational partner.\nMore specifically, the agent should be able to instantiate as a Python object with necessary configurations and a working directory\nas we did for TaskWeaver in ",(0,a.jsx)(t.code,{children:"auto_eval/taskweaver_eval.py"}),":"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:'class TaskWeaverVirtualUser(VirtualUser):\n def __init__(self, task_description: str, app_dir: str, config_var: Optional[dict] = None):\n super().__init__(task_description)\n\n self.app = TaskWeaverApp(app_dir=app_dir, config=config_var)\n self.session = self.app.get_session()\n self.session_id = self.session.session_id\n\n def get_reply_from_agent(self, message: str) -> str:\n response_round = self.session.send_message(\n message,\n event_handler=None,\n )\n assert response_round.state != "failed", "Failed to get response from agent."\n return response_round.post_list[-1].message\n\n def close(self):\n self.app.stop()\n'})}),"\n",(0,a.jsxs)(t.p,{children:["To add another agent, you need to implement the ",(0,a.jsx)(t.code,{children:"VirtualUser"})," class and the ",(0,a.jsx)(t.code,{children:"get_reply_from_agent"}),", ",(0,a.jsx)(t.code,{children:"close"})," methods."]})]})}function c(e={}){const{wrapper:t}={...(0,o.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(l,{...e})}):l(e)}},100:(e,t,n)=>{n.d(t,{Z:()=>a});const a=n.p+"assets/images/evaluation-ac91a46e949f383154a9ffbafcfbc981.png"},1151:(e,t,n)=>{n.d(t,{Z:()=>r,a:()=>i});var a=n(7294);const o={},s=a.createContext(o);function i(e){const t=a.useContext(s);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function r(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(o):e.components||o:i(e.components),a.createElement(s.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
diff --git a/assets/js/fa48389a.293e12eb.js b/assets/js/fa48389a.319cf6b0.js
similarity index 99%
rename from assets/js/fa48389a.293e12eb.js
rename to assets/js/fa48389a.319cf6b0.js
index 3411cd26..92a880dd 100644
--- a/assets/js/fa48389a.293e12eb.js
+++ b/assets/js/fa48389a.319cf6b0.js
@@ -1 +1 @@
-"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[7762],{8610:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>l,contentTitle:()=>s,default:()=>c,frontMatter:()=>o,metadata:()=>i,toc:()=>h});var a=n(5893),r=n(1151);const o={},s="Roles in TaskWeaver",i={permalink:"/TaskWeaver/blog/role",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md",source:"@site/blog/role.md",title:"Roles in TaskWeaver",description:'We frame TaskWeaver as a code-first agent framework. The term "code-first" means that the agent is designed to',date:"2024-05-15T07:38:43.000Z",formattedDate:"May 15, 2024",tags:[],readingTime:6.135,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,prevItem:{title:"How to evaluate a LLM agent?",permalink:"/TaskWeaver/blog/evaluation"}},l={authorsImageUrls:[]},h=[];function d(e){const t={a:"a",code:"code",em:"em",mermaid:"mermaid",p:"p",pre:"pre",strong:"strong",...(0,r.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsxs)(t.p,{children:["We frame TaskWeaver as a ",(0,a.jsx)(t.strong,{children:"code-first"})," agent framework. The term \"code-first\" means that the agent is designed to\nconvert the user's request into one or multiple runnable code snippets and then execute them to generate the response.\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\njust like the natural language for human communication. Therefore, TaskWeaver translates the user's request in natural language into\nprogramming languages, which can be executed by the system to perform the desired tasks."]}),"\n",(0,a.jsxs)(t.p,{children:["Under this design, when the developer needs to extend the agent's capability, they can write a new plugin.\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\nLet's consider an example: ",(0,a.jsx)(t.em,{children:"the agent is asked to load a CSV file and perform anomaly detection on the data"}),".\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read a.csv and perform \n anomaly detection"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"read a.csv and call the \n anomaly_detection plugin\n to find anomalies in the data"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"df=pd.read_csv(\'a.csv\')\n anomaly_df=anomaly_detection(df)"--\x3e D[Code Executor]\n end\n end\n D --result--\x3e B\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["However, we do find challenges for other tasks that are not naturally represented in code snippets.\nLet's consider another example: ",(0,a.jsx)(t.em,{children:"the agent is asked to read a manual and follow the instructions to process the data"}),".\nWe first assume there is a plugin that can read the manual and extract the instructions, called ",(0,a.jsx)(t.code,{children:"read_manual"}),".\nThe workflow of the agent is in the diagram below.\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior\nof the agent is less natural compared to the previous example."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read the manual and follow \n the instructions to process the data"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1: read the manual by \n calling the read_manual \n plugin to extract the instructions"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"instructions=read_manual()\n follow_instructions(instructions)"--\x3e D[Code Executor]\n end\n end\n D --instructions--\x3e B'}),"\n",(0,a.jsxs)(t.p,{children:["Why? First, there is no need to generate code to read the manual and extract the instructions.\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\nEven though that there might be dynamic parts in the code such as some arguments in the function ",(0,a.jsx)(t.code,{children:"read_manual"}),",\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\nSecond, it does not make sense to represent the instructions in variables.\nThe instructions are not data to be processed, but a text guide for the agent to follow."]}),"\n",(0,a.jsxs)(t.p,{children:["For these reasons, we introduced the concept of ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"roles"})," in TaskWeaver.\nRoles are actually not new in TaskWeaver as there are already roles like ",(0,a.jsx)(t.code,{children:"Planner"})," and ",(0,a.jsx)(t.code,{children:"CodeInterpreter"}),".\nTo add a new role, the developer can follow the documentation ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"here"}),".\nIn general, a role is a class that inherits the ",(0,a.jsx)(t.code,{children:"Role"})," class and implements the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nThe ",(0,a.jsx)(t.code,{children:"reply"})," method is the function that the agent calls to interact with the role, which has the\nfollowing signature:"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:"def reply(self, memory: Memory, **kwargs) -> Post:\n # implementation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["It takes the ",(0,a.jsx)(t.code,{children:"memory"})," object, which is the memory of the agent, and returns a ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object, which is the response of the role to the Planner.\nWith the ",(0,a.jsx)(t.code,{children:"memory"})," object, the role can access the history of the conversation and the context of the conversation.\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\nIf a role needs to talk to another role, it should go through the Planner.\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task\nas shown in the diagram below."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"request"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1"--\x3e C[Role 1]\n C --reply--\x3e B\n B --"step 2"--\x3e D[Role 2]\n D --reply--\x3e B\n B --"step 3"--\x3e E[Role 3]\n E --reply--\x3e B\n end\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["The communication between the Planner and the roles is done through the ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object.\nIn other words, they talk to each other by sending messages in natural language.\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it."]}),"\n",(0,a.jsxs)(t.p,{children:["There is a challenge in implementing multiple roles that is missing information.\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\nWhen the Planner obtains the instructions from a role called ",(0,a.jsx)(t.code,{children:"manual_reader"}),", it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt,\nit is still possible that the Planner misses some information."]}),"\n",(0,a.jsxs)(t.p,{children:["To address this challenge, we introduce the concept of ",(0,a.jsx)(t.code,{children:"board"})," in TaskWeaver.\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a shared memory space that can be accessed by all roles, which is associated with the current ",(0,a.jsx)(t.a,{href:"/docs/concepts/round",children:"Round"}),".\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a dictionary-like object that can store any information that is needed by the roles.\nEach role can decide to write or read any information from the ",(0,a.jsx)(t.code,{children:"board"}),"."]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:' def write_board(self, role_alias: str, bulletin: str) -> None:\n """Add a bulletin to the round."""\n self.board[role_alias] = bulletin\n\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\n """Read the bulletin of the round."""\n if role_alias is None:\n return self.board\n return self.board.get(role_alias, None)\n'})}),"\n",(0,a.jsxs)(t.p,{children:["One concrete example of using the ",(0,a.jsx)(t.code,{children:"board"})," is to pass the user's request to the CodeInterpreter role.\nWhen the Planner receives the user's request, it can write the request and its step-wise plan to the ",(0,a.jsx)(t.code,{children:"board"}),".\nThe CodeInterpreter role can then read the request and the plan from the ",(0,a.jsx)(t.code,{children:"board"})," to execute the plan."]}),"\n",(0,a.jsxs)(t.p,{children:["In summary, the concept of roles in TaskWeaver is to provide a way to extend the agent's capability by implementing new roles.\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the ",(0,a.jsx)(t.code,{children:"Role"})," class and implementing the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nAll extra roles should be put in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, which will be automatically loaded by TaskWeaver.\nWe have provided a few sample roles in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, such as the ",(0,a.jsx)(t.code,{children:"Echo"})," role that echoes the user's message back to the user.\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."]})]})}function c(e={}){const{wrapper:t}={...(0,r.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(d,{...e})}):d(e)}},1151:(e,t,n)=>{n.d(t,{Z:()=>i,a:()=>s});var a=n(7294);const r={},o=a.createContext(r);function s(e){const t=a.useContext(o);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function i(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:s(e.components),a.createElement(o.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
+"use strict";(self.webpackChunkwebsite=self.webpackChunkwebsite||[]).push([[7762],{8610:(e,t,n)=>{n.r(t),n.d(t,{assets:()=>l,contentTitle:()=>s,default:()=>c,frontMatter:()=>o,metadata:()=>i,toc:()=>h});var a=n(5893),r=n(1151);const o={},s="Roles in TaskWeaver",i={permalink:"/TaskWeaver/blog/role",editUrl:"https://github.com/microsoft/TaskWeaver/tree/main/website/blog/role.md",source:"@site/blog/role.md",title:"Roles in TaskWeaver",description:'We frame TaskWeaver as a code-first agent framework. The term "code-first" means that the agent is designed to',date:"2024-05-16T07:27:05.000Z",formattedDate:"May 16, 2024",tags:[],readingTime:6.135,hasTruncateMarker:!1,authors:[],frontMatter:{},unlisted:!1,prevItem:{title:"How to evaluate a LLM agent?",permalink:"/TaskWeaver/blog/evaluation"}},l={authorsImageUrls:[]},h=[];function d(e){const t={a:"a",code:"code",em:"em",mermaid:"mermaid",p:"p",pre:"pre",strong:"strong",...(0,r.a)(),...e.components};return(0,a.jsxs)(a.Fragment,{children:[(0,a.jsxs)(t.p,{children:["We frame TaskWeaver as a ",(0,a.jsx)(t.strong,{children:"code-first"})," agent framework. The term \"code-first\" means that the agent is designed to\nconvert the user's request into one or multiple runnable code snippets and then execute them to generate the response.\nThe philosophy behind this design is to consider programming languages as the de facto language for communication in cyber-physical systems,\njust like the natural language for human communication. Therefore, TaskWeaver translates the user's request in natural language into\nprogramming languages, which can be executed by the system to perform the desired tasks."]}),"\n",(0,a.jsxs)(t.p,{children:["Under this design, when the developer needs to extend the agent's capability, they can write a new plugin.\nA plugin is a piece of code wrapped in a class that can be called as a function by the agent in the generated code snippets.\nLet's consider an example: ",(0,a.jsx)(t.em,{children:"the agent is asked to load a CSV file and perform anomaly detection on the data"}),".\nThe workflow of the agent is in the diagram below. It is very natural to represent data to be processed in variables and this task in code snippets."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read a.csv and perform \n anomaly detection"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"read a.csv and call the \n anomaly_detection plugin\n to find anomalies in the data"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"df=pd.read_csv(\'a.csv\')\n anomaly_df=anomaly_detection(df)"--\x3e D[Code Executor]\n end\n end\n D --result--\x3e B\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["However, we do find challenges for other tasks that are not naturally represented in code snippets.\nLet's consider another example: ",(0,a.jsx)(t.em,{children:"the agent is asked to read a manual and follow the instructions to process the data"}),".\nWe first assume there is a plugin that can read the manual and extract the instructions, called ",(0,a.jsx)(t.code,{children:"read_manual"}),".\nThe workflow of the agent is in the diagram below.\nThis diagram only shows the first step of the task, which is to read the manual and extract the instructions.\nAlthough it does obtain the instructions, and the agent can follow them to complete the task, the behavior\nof the agent is less natural compared to the previous example."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"read the manual and follow \n the instructions to process the data"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1: read the manual by \n calling the read_manual \n plugin to extract the instructions"--\x3e C[Code Generator]\n subgraph Code Interpreter\n C --"instructions=read_manual()\n follow_instructions(instructions)"--\x3e D[Code Executor]\n end\n end\n D --instructions--\x3e B'}),"\n",(0,a.jsxs)(t.p,{children:["Why? First, there is no need to generate code to read the manual and extract the instructions.\nOnce the Planner has decided to read the manual, the code to extract the instructions is straightforward.\nEven though that there might be dynamic parts in the code such as some arguments in the function ",(0,a.jsx)(t.code,{children:"read_manual"}),",\nit could be handled by the Planner. Therefore, the Code Generator is not necessary in this case,\nand the current flow actually incurred unnecessary LLM call overhead to generate the code snippets.\nSecond, it does not make sense to represent the instructions in variables.\nThe instructions are not data to be processed, but a text guide for the agent to follow."]}),"\n",(0,a.jsxs)(t.p,{children:["For these reasons, we introduced the concept of ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"roles"})," in TaskWeaver.\nRoles are actually not new in TaskWeaver as there are already roles like ",(0,a.jsx)(t.code,{children:"Planner"})," and ",(0,a.jsx)(t.code,{children:"CodeInterpreter"}),".\nTo add a new role, the developer can follow the documentation ",(0,a.jsx)(t.a,{href:"/docs/concepts/role",children:"here"}),".\nIn general, a role is a class that inherits the ",(0,a.jsx)(t.code,{children:"Role"})," class and implements the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nThe ",(0,a.jsx)(t.code,{children:"reply"})," method is the function that the agent calls to interact with the role, which has the\nfollowing signature:"]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:"def reply(self, memory: Memory, **kwargs) -> Post:\n # implementation\n"})}),"\n",(0,a.jsxs)(t.p,{children:["It takes the ",(0,a.jsx)(t.code,{children:"memory"})," object, which is the memory of the agent, and returns a ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object, which is the response of the role to the Planner.\nWith the ",(0,a.jsx)(t.code,{children:"memory"})," object, the role can access the history of the conversation and the context of the conversation.\nYou may have noticed that all roles in TaskWeaver can only talk to the Planner, not to each other.\nIf a role needs to talk to another role, it should go through the Planner.\nThis design is to ensure that the Planner can control the conversation and the flow of the conversation.\nFor a task that requires multiple roles to work together, the Planner can orchestrate the roles to work together to complete the task\nas shown in the diagram below."]}),"\n",(0,a.jsx)(t.mermaid,{value:'flowchart TD\n A[User] --"request"--\x3e B[Planner]\n subgraph TaskWeaver \n B --"step 1"--\x3e C[Role 1]\n C --reply--\x3e B\n B --"step 2"--\x3e D[Role 2]\n D --reply--\x3e B\n B --"step 3"--\x3e E[Role 3]\n E --reply--\x3e B\n end\n B --response--\x3e A'}),"\n",(0,a.jsxs)(t.p,{children:["The communication between the Planner and the roles is done through the ",(0,a.jsx)(t.a,{href:"/docs/concepts/post",children:"Post"})," object.\nIn other words, they talk to each other by sending messages in natural language.\nWhat if a role needs to send some data to another role? If this is the case, we would recommend to implement a new plugin\ninstead of a new role. Otherwise, you may need to store the data in an external storage like a database and let the other role to access it."]}),"\n",(0,a.jsxs)(t.p,{children:["There is a challenge in implementing multiple roles that is missing information.\nConsider the case in our previous example where the agent is asked to read a manual and follow the instructions to process the data.\nWhen the Planner obtains the instructions from a role called ",(0,a.jsx)(t.code,{children:"manual_reader"}),", it needs to pass the instructions to the CodeInterpreter role to execute the instructions.\nSometimes, the Planner may miss critical information that is needed by the CodeInterpreter role.\nEven though we can emphasize the importance of the Planner to pass all the necessary information to the roles in the prompt,\nit is still possible that the Planner misses some information."]}),"\n",(0,a.jsxs)(t.p,{children:["To address this challenge, we introduce the concept of ",(0,a.jsx)(t.code,{children:"board"})," in TaskWeaver.\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a shared memory space that can be accessed by all roles, which is associated with the current ",(0,a.jsx)(t.a,{href:"/docs/concepts/round",children:"Round"}),".\nThe ",(0,a.jsx)(t.code,{children:"board"})," is a dictionary-like object that can store any information that is needed by the roles.\nEach role can decide to write or read any information from the ",(0,a.jsx)(t.code,{children:"board"}),"."]}),"\n",(0,a.jsx)(t.pre,{children:(0,a.jsx)(t.code,{className:"language-python",children:' def write_board(self, role_alias: str, bulletin: str) -> None:\n """Add a bulletin to the round."""\n self.board[role_alias] = bulletin\n\ndef read_board(self, role_alias: Optional[str] = None) -> Union[Dict[str, str], str]:\n """Read the bulletin of the round."""\n if role_alias is None:\n return self.board\n return self.board.get(role_alias, None)\n'})}),"\n",(0,a.jsxs)(t.p,{children:["One concrete example of using the ",(0,a.jsx)(t.code,{children:"board"})," is to pass the user's request to the CodeInterpreter role.\nWhen the Planner receives the user's request, it can write the request and its step-wise plan to the ",(0,a.jsx)(t.code,{children:"board"}),".\nThe CodeInterpreter role can then read the request and the plan from the ",(0,a.jsx)(t.code,{children:"board"})," to execute the plan."]}),"\n",(0,a.jsxs)(t.p,{children:["In summary, the concept of roles in TaskWeaver is to provide a way to extend the agent's capability by implementing new roles.\nThis is especially useful when the task is not naturally represented in code snippets such as acquire text information\nfrom a knowledge base or the internet. Implementing a new role is straightforward by inheriting the ",(0,a.jsx)(t.code,{children:"Role"})," class and implementing the ",(0,a.jsx)(t.code,{children:"reply"})," method.\nAll extra roles should be put in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, which will be automatically loaded by TaskWeaver.\nWe have provided a few sample roles in the ",(0,a.jsx)(t.code,{children:"TaskWeaver/taskweaver/ext_role"})," folder, such as the ",(0,a.jsx)(t.code,{children:"Echo"})," role that echoes the user's message back to the user.\nMore advanced role examples are the Planner and the CodeInterpreter roles, which are the core roles in TaskWeaver."]})]})}function c(e={}){const{wrapper:t}={...(0,r.a)(),...e.components};return t?(0,a.jsx)(t,{...e,children:(0,a.jsx)(d,{...e})}):d(e)}},1151:(e,t,n)=>{n.d(t,{Z:()=>i,a:()=>s});var a=n(7294);const r={},o=a.createContext(r);function s(e){const t=a.useContext(o);return a.useMemo((function(){return"function"==typeof e?e(t):{...t,...e}}),[t,e])}function i(e){let t;return t=e.disableParentContext?"function"==typeof e.components?e.components(r):e.components||r:s(e.components),a.createElement(o.Provider,{value:t},e.children)}}}]);
\ No newline at end of file
diff --git a/assets/js/runtime~main.8f074b0a.js b/assets/js/runtime~main.9211180e.js
similarity index 93%
rename from assets/js/runtime~main.8f074b0a.js
rename to assets/js/runtime~main.9211180e.js
index 6cf98a5c..65e7da0f 100644
--- a/assets/js/runtime~main.8f074b0a.js
+++ b/assets/js/runtime~main.9211180e.js
@@ -1 +1 @@
-(()=>{"use strict";var e,a,f,c,d,b={},t={};function r(e){var a=t[e];if(void 0!==a)return a.exports;var f=t[e]={exports:{}};return b[e].call(f.exports,f,f.exports,r),f.exports}r.m=b,e=[],r.O=(a,f,c,d)=>{if(!f){var b=1/0;for(i=0;i=d)&&Object.keys(r.O).every((e=>r.O[e](f[o])))?f.splice(o--,1):(t=!1,d0&&e[i-1][2]>d;i--)e[i]=e[i-1];e[i]=[f,c,d]},r.n=e=>{var a=e&&e.__esModule?()=>e.default:()=>e;return r.d(a,{a:a}),a},f=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,c){if(1&c&&(e=this(e)),8&c)return e;if("object"==typeof e&&e){if(4&c&&e.__esModule)return e;if(16&c&&"function"==typeof e.then)return e}var d=Object.create(null);r.r(d);var b={};a=a||[null,f({}),f([]),f(f)];for(var t=2&c&&e;"object"==typeof t&&!~a.indexOf(t);t=f(t))Object.getOwnPropertyNames(t).forEach((a=>b[a]=()=>e[a]));return b.default=()=>e,r.d(d,b),d},r.d=(e,a)=>{for(var f in a)r.o(a,f)&&!r.o(e,f)&&Object.defineProperty(e,f,{enumerable:!0,get:a[f]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce(((a,f)=>(r.f[f](e,a),a)),[])),r.u=e=>"assets/js/"+({53:"935f2afb",419:"a4259125",679:"e8e13c91",729:"0c5435fe",746:"7fc9262a",776:"873331c9",1018:"7555bb16",1070:"fbacdda6",1290:"03e8cedb",1294:"78b89a17",1372:"1db64337",1559:"30613cee",1599:"b3e09ff4",1766:"2ce24e93",1899:"c39bf4d4",1916:"dfcda4d1",2131:"a0385f53",2281:"0c4a3f3a",2429:"cf09775e",2442:"72cb6e7f",2535:"814f3328",2646:"55efe1e1",2651:"8070e160",2802:"b21ad4a1",3085:"1f391b9e",3089:"a6aa9e1f",3287:"d3234990",3424:"fe04a91d",3450:"2726c345",3608:"9e4087bc",3809:"1bff86ef",3991:"45fd52fa",4195:"c4f5d8e4",4229:"a417478a",4288:"ad895e75",4368:"a94703ab",4398:"a55bc7d4",4471:"834e34cc",4904:"a27d32e8",5167:"044bc5cf",5504:"9bc17760",5675:"ae863774",5870:"d385135b",6008:"93a501ed",6063:"47284eb1",6103:"ccc49370",6417:"88f45e24",6486:"5a1935a3",6608:"8257ffa4",7102:"d9a31669",7181:"fa377e30",7252:"6b4ad289",7288:"61db00e1",7399:"360ca471",7414:"393be207",7428:"13e97266",7762:"fa48389a",7918:"17896441",7920:"1a4e3797",8518:"a7bd4aaa",8654:"be4af720",8905:"f04cdb7e",9190:"2114d3cd",9212:"fc8fddfe",9285:"2644c4f4",9522:"792477b0",9555:"f10ee74f",9661:"5e95c892",9817:"14eb3368",9822:"a9f7b4d5",9861:"85be924b"}[e]||e)+"."+{53:"a1e2c144",109:"f79821ca",132:"0e860ffc",240:"3397428c",419:"8ce2b16e",679:"19d5237a",729:"5bd7cfd5",746:"84570a1b",776:"0b965284",1018:"2ef58bfc",1070:"7e7a2ec4",1290:"9a2b7dab",1294:"4bf54039",1372:"e8dee0f1",1504:"311080c2",1559:"e59ce107",1599:"a79d872f",1644:"236c5943",1763:"78887070",1766:"d82e9ee2",1772:"501951df",1899:"f1fc4dc0",1916:"d49fdb6b",2131:"4bb43b95",2183:"f325ef61",2281:"702219b6",2429:"3f88c550",2442:"6ae3723f",2535:"d073ebab",2646:"af4af1f3",2651:"4800f253",2661:"4ec40333",2693:"78ee6dba",2696:"21e5e3f4",2700:"c68ce667",2802:"c8713f52",3085:"0ac7cda7",3089:"f5b8d274",3287:"2d61760a",3424:"5d6b89cc",3450:"fe3a06a8",3608:"997f2ef5",3619:"9e191067",3809:"eaf30c0a",3991:"6413b7e3",4195:"902a52cf",4229:"aba6c885",4238:"ba414065",4288:"aedf49e0",4368:"3856b628",4398:"a4e23676",4471:"9d7a5eb5",4706:"1aeac5d9",4904:"42aafddf",5167:"5af86d3d",5269:"ac755118",5326:"1440cd5a",5504:"a5dd80e7",5525:"36b2ab15",5675:"f641eabd",5790:"b37ddec6",5870:"c6daec6b",5886:"81854eda",5943:"cfa72ddd",6008:"b7f6443f",6063:"67b4f106",6103:"39082596",6255:"6d06eead",6417:"2209c2ae",6486:"087eb325",6608:"1f072f7d",6648:"4ce822b7",6985:"075ba3c5",7102:"eeb517c9",7181:"ead5cf33",7252:"ef04d27c",7288:"2b76cfa1",7399:"5f1b9455",7414:"c8f1cefb",7428:"b079a898",7762:"293e12eb",7779:"bb31db26",7918:"ab268e7a",7920:"c829ba7b",7936:"c7718e7a",8016:"ffc43652",8443:"30345cef",8518:"687d6c08",8654:"190b348f",8905:"0cdf2063",8955:"c13c87ac",9138:"86515bcb",9190:"793259c4",9212:"2bf4913f",9285:"27d801e9",9522:"73d1095a",9555:"2e3f0992",9661:"7e330830",9677:"b87438ef",9817:"61c5c133",9822:"9fd28b0e",9840:"ba9cfc9d",9861:"aeed1ffb",9893:"b33d789f"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),c={},d="website:",r.l=(e,a,f,b)=>{if(c[e])c[e].push(a);else{var t,o;if(void 0!==f)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var d=c[e];if(delete c[e],t.parentNode&&t.parentNode.removeChild(t),d&&d.forEach((e=>e(f))),a)return a(f)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/TaskWeaver/",r.gca=function(e){return e={17896441:"7918","935f2afb":"53",a4259125:"419",e8e13c91:"679","0c5435fe":"729","7fc9262a":"746","873331c9":"776","7555bb16":"1018",fbacdda6:"1070","03e8cedb":"1290","78b89a17":"1294","1db64337":"1372","30613cee":"1559",b3e09ff4:"1599","2ce24e93":"1766",c39bf4d4:"1899",dfcda4d1:"1916",a0385f53:"2131","0c4a3f3a":"2281",cf09775e:"2429","72cb6e7f":"2442","814f3328":"2535","55efe1e1":"2646","8070e160":"2651",b21ad4a1:"2802","1f391b9e":"3085",a6aa9e1f:"3089",d3234990:"3287",fe04a91d:"3424","2726c345":"3450","9e4087bc":"3608","1bff86ef":"3809","45fd52fa":"3991",c4f5d8e4:"4195",a417478a:"4229",ad895e75:"4288",a94703ab:"4368",a55bc7d4:"4398","834e34cc":"4471",a27d32e8:"4904","044bc5cf":"5167","9bc17760":"5504",ae863774:"5675",d385135b:"5870","93a501ed":"6008","47284eb1":"6063",ccc49370:"6103","88f45e24":"6417","5a1935a3":"6486","8257ffa4":"6608",d9a31669:"7102",fa377e30:"7181","6b4ad289":"7252","61db00e1":"7288","360ca471":"7399","393be207":"7414","13e97266":"7428",fa48389a:"7762","1a4e3797":"7920",a7bd4aaa:"8518",be4af720:"8654",f04cdb7e:"8905","2114d3cd":"9190",fc8fddfe:"9212","2644c4f4":"9285","792477b0":"9522",f10ee74f:"9555","5e95c892":"9661","14eb3368":"9817",a9f7b4d5:"9822","85be924b":"9861"}[e]||e,r.p+r.u(e)},(()=>{var e={1303:0,532:0};r.f.j=(a,f)=>{var c=r.o(e,a)?e[a]:void 0;if(0!==c)if(c)f.push(c[2]);else if(/^(1303|532)$/.test(a))e[a]=0;else{var d=new Promise(((f,d)=>c=e[a]=[f,d]));f.push(c[2]=d);var b=r.p+r.u(a),t=new Error;r.l(b,(f=>{if(r.o(e,a)&&(0!==(c=e[a])&&(e[a]=void 0),c)){var d=f&&("load"===f.type?"missing":f.type),b=f&&f.target&&f.target.src;t.message="Loading chunk "+a+" failed.\n("+d+": "+b+")",t.name="ChunkLoadError",t.type=d,t.request=b,c[1](t)}}),"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,f)=>{var c,d,b=f[0],t=f[1],o=f[2],n=0;if(b.some((a=>0!==e[a]))){for(c in t)r.o(t,c)&&(r.m[c]=t[c]);if(o)var i=o(r)}for(a&&a(f);n{"use strict";var e,a,f,c,d,b={},t={};function r(e){var a=t[e];if(void 0!==a)return a.exports;var f=t[e]={exports:{}};return b[e].call(f.exports,f,f.exports,r),f.exports}r.m=b,e=[],r.O=(a,f,c,d)=>{if(!f){var b=1/0;for(i=0;i=d)&&Object.keys(r.O).every((e=>r.O[e](f[o])))?f.splice(o--,1):(t=!1,d0&&e[i-1][2]>d;i--)e[i]=e[i-1];e[i]=[f,c,d]},r.n=e=>{var a=e&&e.__esModule?()=>e.default:()=>e;return r.d(a,{a:a}),a},f=Object.getPrototypeOf?e=>Object.getPrototypeOf(e):e=>e.__proto__,r.t=function(e,c){if(1&c&&(e=this(e)),8&c)return e;if("object"==typeof e&&e){if(4&c&&e.__esModule)return e;if(16&c&&"function"==typeof e.then)return e}var d=Object.create(null);r.r(d);var b={};a=a||[null,f({}),f([]),f(f)];for(var t=2&c&&e;"object"==typeof t&&!~a.indexOf(t);t=f(t))Object.getOwnPropertyNames(t).forEach((a=>b[a]=()=>e[a]));return b.default=()=>e,r.d(d,b),d},r.d=(e,a)=>{for(var f in a)r.o(a,f)&&!r.o(e,f)&&Object.defineProperty(e,f,{enumerable:!0,get:a[f]})},r.f={},r.e=e=>Promise.all(Object.keys(r.f).reduce(((a,f)=>(r.f[f](e,a),a)),[])),r.u=e=>"assets/js/"+({53:"935f2afb",419:"a4259125",679:"e8e13c91",729:"0c5435fe",746:"7fc9262a",776:"873331c9",1018:"7555bb16",1070:"fbacdda6",1290:"03e8cedb",1294:"78b89a17",1372:"1db64337",1559:"30613cee",1599:"b3e09ff4",1766:"2ce24e93",1899:"c39bf4d4",1916:"dfcda4d1",2131:"a0385f53",2281:"0c4a3f3a",2429:"cf09775e",2442:"72cb6e7f",2535:"814f3328",2646:"55efe1e1",2651:"8070e160",2802:"b21ad4a1",3085:"1f391b9e",3089:"a6aa9e1f",3287:"d3234990",3424:"fe04a91d",3450:"2726c345",3608:"9e4087bc",3809:"1bff86ef",3991:"45fd52fa",4195:"c4f5d8e4",4229:"a417478a",4288:"ad895e75",4368:"a94703ab",4398:"a55bc7d4",4471:"834e34cc",4904:"a27d32e8",5167:"044bc5cf",5504:"9bc17760",5675:"ae863774",5870:"d385135b",6008:"93a501ed",6063:"47284eb1",6103:"ccc49370",6417:"88f45e24",6486:"5a1935a3",6608:"8257ffa4",7102:"d9a31669",7181:"fa377e30",7252:"6b4ad289",7288:"61db00e1",7399:"360ca471",7414:"393be207",7428:"13e97266",7762:"fa48389a",7918:"17896441",7920:"1a4e3797",8518:"a7bd4aaa",8654:"be4af720",8905:"f04cdb7e",9190:"2114d3cd",9212:"fc8fddfe",9285:"2644c4f4",9522:"792477b0",9555:"f10ee74f",9661:"5e95c892",9817:"14eb3368",9822:"a9f7b4d5",9861:"85be924b"}[e]||e)+"."+{53:"a1e2c144",109:"f79821ca",132:"0e860ffc",240:"3397428c",419:"8ce2b16e",679:"19d5237a",729:"5bd7cfd5",746:"84570a1b",776:"0b965284",1018:"2ef58bfc",1070:"7e7a2ec4",1290:"9a2b7dab",1294:"4bf54039",1372:"e8dee0f1",1504:"311080c2",1559:"e59ce107",1599:"a79d872f",1644:"236c5943",1763:"78887070",1766:"d82e9ee2",1772:"501951df",1899:"59fa2419",1916:"d49fdb6b",2131:"4bb43b95",2183:"f325ef61",2281:"702219b6",2429:"3f88c550",2442:"6ae3723f",2535:"d073ebab",2646:"af4af1f3",2651:"4800f253",2661:"4ec40333",2693:"78ee6dba",2696:"21e5e3f4",2700:"c68ce667",2802:"c8713f52",3085:"0ac7cda7",3089:"f5b8d274",3287:"2d61760a",3424:"5d6b89cc",3450:"fe3a06a8",3608:"997f2ef5",3619:"9e191067",3809:"a54ebcb3",3991:"6413b7e3",4195:"902a52cf",4229:"aba6c885",4238:"ba414065",4288:"aedf49e0",4368:"3856b628",4398:"a4e23676",4471:"9d7a5eb5",4706:"1aeac5d9",4904:"544cc49c",5167:"5af86d3d",5269:"ac755118",5326:"1440cd5a",5504:"a5dd80e7",5525:"36b2ab15",5675:"f641eabd",5790:"b37ddec6",5870:"c6daec6b",5886:"81854eda",5943:"cfa72ddd",6008:"b7f6443f",6063:"67b4f106",6103:"39082596",6255:"6d06eead",6417:"2209c2ae",6486:"087eb325",6608:"1f072f7d",6648:"4ce822b7",6985:"075ba3c5",7102:"eeb517c9",7181:"ead5cf33",7252:"ef04d27c",7288:"2b76cfa1",7399:"5f1b9455",7414:"c8f1cefb",7428:"b079a898",7762:"319cf6b0",7779:"bb31db26",7918:"ab268e7a",7920:"c829ba7b",7936:"c7718e7a",8016:"ffc43652",8443:"30345cef",8518:"687d6c08",8654:"190b348f",8905:"0cdf2063",8955:"c13c87ac",9138:"86515bcb",9190:"793259c4",9212:"2bf4913f",9285:"27d801e9",9522:"e145da8b",9555:"2e3f0992",9661:"7e330830",9677:"b87438ef",9817:"61c5c133",9822:"9fd28b0e",9840:"ba9cfc9d",9861:"aeed1ffb",9893:"b33d789f"}[e]+".js",r.miniCssF=e=>{},r.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),r.o=(e,a)=>Object.prototype.hasOwnProperty.call(e,a),c={},d="website:",r.l=(e,a,f,b)=>{if(c[e])c[e].push(a);else{var t,o;if(void 0!==f)for(var n=document.getElementsByTagName("script"),i=0;i{t.onerror=t.onload=null,clearTimeout(s);var d=c[e];if(delete c[e],t.parentNode&&t.parentNode.removeChild(t),d&&d.forEach((e=>e(f))),a)return a(f)},s=setTimeout(l.bind(null,void 0,{type:"timeout",target:t}),12e4);t.onerror=l.bind(null,t.onerror),t.onload=l.bind(null,t.onload),o&&document.head.appendChild(t)}},r.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},r.p="/TaskWeaver/",r.gca=function(e){return e={17896441:"7918","935f2afb":"53",a4259125:"419",e8e13c91:"679","0c5435fe":"729","7fc9262a":"746","873331c9":"776","7555bb16":"1018",fbacdda6:"1070","03e8cedb":"1290","78b89a17":"1294","1db64337":"1372","30613cee":"1559",b3e09ff4:"1599","2ce24e93":"1766",c39bf4d4:"1899",dfcda4d1:"1916",a0385f53:"2131","0c4a3f3a":"2281",cf09775e:"2429","72cb6e7f":"2442","814f3328":"2535","55efe1e1":"2646","8070e160":"2651",b21ad4a1:"2802","1f391b9e":"3085",a6aa9e1f:"3089",d3234990:"3287",fe04a91d:"3424","2726c345":"3450","9e4087bc":"3608","1bff86ef":"3809","45fd52fa":"3991",c4f5d8e4:"4195",a417478a:"4229",ad895e75:"4288",a94703ab:"4368",a55bc7d4:"4398","834e34cc":"4471",a27d32e8:"4904","044bc5cf":"5167","9bc17760":"5504",ae863774:"5675",d385135b:"5870","93a501ed":"6008","47284eb1":"6063",ccc49370:"6103","88f45e24":"6417","5a1935a3":"6486","8257ffa4":"6608",d9a31669:"7102",fa377e30:"7181","6b4ad289":"7252","61db00e1":"7288","360ca471":"7399","393be207":"7414","13e97266":"7428",fa48389a:"7762","1a4e3797":"7920",a7bd4aaa:"8518",be4af720:"8654",f04cdb7e:"8905","2114d3cd":"9190",fc8fddfe:"9212","2644c4f4":"9285","792477b0":"9522",f10ee74f:"9555","5e95c892":"9661","14eb3368":"9817",a9f7b4d5:"9822","85be924b":"9861"}[e]||e,r.p+r.u(e)},(()=>{var e={1303:0,532:0};r.f.j=(a,f)=>{var c=r.o(e,a)?e[a]:void 0;if(0!==c)if(c)f.push(c[2]);else if(/^(1303|532)$/.test(a))e[a]=0;else{var d=new Promise(((f,d)=>c=e[a]=[f,d]));f.push(c[2]=d);var b=r.p+r.u(a),t=new Error;r.l(b,(f=>{if(r.o(e,a)&&(0!==(c=e[a])&&(e[a]=void 0),c)){var d=f&&("load"===f.type?"missing":f.type),b=f&&f.target&&f.target.src;t.message="Loading chunk "+a+" failed.\n("+d+": "+b+")",t.name="ChunkLoadError",t.type=d,t.request=b,c[1](t)}}),"chunk-"+a,a)}},r.O.j=a=>0===e[a];var a=(a,f)=>{var c,d,b=f[0],t=f[1],o=f[2],n=0;if(b.some((a=>0!==e[a]))){for(c in t)r.o(t,c)&&(r.m[c]=t[c]);if(o)var i=o(r)}for(a&&a(f);nArchive | TaskWeaver
-
+
-