<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>Task on Producthunt daily</title>
        <link>https://producthunt.programnotes.cn/en/tags/task/</link>
        <description>Recent content in Task on Producthunt daily</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <lastBuildDate>Thu, 21 Aug 2025 15:31:31 +0800</lastBuildDate><atom:link href="https://producthunt.programnotes.cn/en/tags/task/index.xml" rel="self" type="application/rss+xml" /><item>
        <title>terminal-bench</title>
        <link>https://producthunt.programnotes.cn/en/p/terminal-bench/</link>
        <pubDate>Thu, 21 Aug 2025 15:31:31 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/terminal-bench/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1648730665635-4f2b670cd921?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTU3NjE0MDF8&amp;ixlib=rb-4.1.0" alt="Featured image of post terminal-bench" /&gt;&lt;h1 id=&#34;laude-instituteterminal-bench&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/laude-institute/terminal-bench&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;laude-institute/terminal-bench&lt;/a&gt;
&lt;/h1&gt;&lt;h1 id=&#34;terminal-bench&#34;&gt;terminal-bench
&lt;/h1&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-text&#34; data-lang=&#34;text&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#####################################################################
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#  _____                   _             _     ______________       #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;# |_   _|__ _ __ _ __ ___ (_)_ __   __ _| |   ||            ||      #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#   | |/ _ \ &amp;#39;__| &amp;#39;_ ` _ \| | &amp;#39;_ \ / _` | |   || &amp;gt;          ||      #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#   | |  __/ |  | | | | | | | | | | (_| | |   ||            ||      #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#   |_|\___|_|  |_| |_| |_|_|_| |_|\__,_|_|   ||____________||      #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#   ____                  _                   |______________|      #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#  | __ )  ___ _ __   ___| |__                 \\############\\     #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#  |  _ \ / _ \ &amp;#39;_ \ / __| &amp;#39;_ \                 \\############\\    # 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#  | |_) |  __/ | | | (__| | | |                 \      ____    \   #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#  |____/ \___|_| |_|\___|_| |_|                  \_____\___\____\  #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#                                                                   #
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;#####################################################################
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://discord.gg/6xWPKhGDbA&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/Join_our_discord-5865F2?style=for-the-badge&amp;amp;logo=discord&amp;amp;logoColor=white&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Discord&#34;
	
	
&gt;&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/laude-institute/terminal-bench&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/T--Bench-000000?style=for-the-badge&amp;amp;logo=github&amp;amp;logoColor=000&amp;amp;logoColor=white&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Github&#34;
	
	
&gt;&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/Docs-000000?style=for-the-badge&amp;amp;logo=mdbook&amp;amp;color=105864&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Docs&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Terminal-Bench is the benchmark for testing AI agents in real terminal environments. From compiling code to training models and setting up servers, Terminal-Bench evaluates how well agents can handle real-world, end-to-end tasks - autonomously.&lt;/p&gt;
&lt;p&gt;Whether you&amp;rsquo;re building LLM agents, benchmarking frameworks, or stress-testing system-level reasoning, Terminal-Bench gives you a reproducible task suite and execution harness designed for practical, real-world evaluation.&lt;/p&gt;
&lt;p&gt;Terminal-Bench consists of two parts: a &lt;strong&gt;dataset of tasks&lt;/strong&gt;, and an &lt;strong&gt;execution harness&lt;/strong&gt; that connects a language model to our terminal sandbox.&lt;/p&gt;
&lt;p&gt;Terminal-Bench is currently in &lt;strong&gt;beta&lt;/strong&gt; with ~100 tasks. Over the coming months, we are going to expand Terminal-Bench into a comprehensive testbed for AI agents in text-based environments. Any contributions are welcome, especially new and challenging tasks!&lt;/p&gt;
&lt;h2 id=&#34;quickstart&#34;&gt;Quickstart
&lt;/h2&gt;&lt;p&gt;Our &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/installation&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Quickstart Guide&lt;/a&gt; will walk you through installing the repo and contributing.&lt;/p&gt;
&lt;p&gt;Terminal-Bench is distributed as a pip package and can be run using the Terminal-Bench CLI: &lt;code&gt;tb&lt;/code&gt;.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv tool install terminal-bench
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;or&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install terminal-bench
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;further-documentation&#34;&gt;Further Documentation
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/tasks&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Task Gallery&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/task-ideas&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Task Ideas&lt;/a&gt; - Browse community-sourced task ideas&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/dashboard&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Dashboard Documentation&lt;/a&gt; - Information about the Terminal-Bench dashboard&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;core-components&#34;&gt;Core Components
&lt;/h2&gt;&lt;h3 id=&#34;dataset-of-tasks&#34;&gt;Dataset of Tasks
&lt;/h3&gt;&lt;p&gt;Each task in Terminal-Bench includes&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;an instruction in English,&lt;/li&gt;
&lt;li&gt;a test script to verify if the language model / agent completed the task successfully,&lt;/li&gt;
&lt;li&gt;a reference (&amp;ldquo;oracle&amp;rdquo;) solution that solves the task.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Tasks are located in the &lt;a class=&#34;link&#34; href=&#34;./tasks&#34; &gt;&lt;code&gt;tasks&lt;/code&gt;&lt;/a&gt; folder of the repository, and the aforementioned list of current tasks gives an overview that is easy to browse.&lt;/p&gt;
&lt;h3 id=&#34;execution-harness&#34;&gt;Execution Harness
&lt;/h3&gt;&lt;p&gt;The harness connects language models to a sandboxed terminal environment. After &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/installation&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;installing the terminal-bench package&lt;/a&gt; (along with the dependencies &lt;code&gt;uv&lt;/code&gt; and &lt;code&gt;Docker&lt;/code&gt;) you can view how to run the harness using:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;tb run --help
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For detailed information about running the harness and its options, see the &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/first-steps&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;documentation&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;submit-to-our-leaderboard&#34;&gt;Submit to Our Leaderboard
&lt;/h3&gt;&lt;p&gt;Terminal-Bench-Core v0.1.1 is the set of tasks for Terminal-Bench&amp;rsquo;s beta release and corresponds to the current leaderboard. To evaluate on it pass &lt;code&gt;--dataset-name terminal-bench-core&lt;/code&gt; and &lt;code&gt;--dataset-version 0.1.1&lt;/code&gt; to the harness. For example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;tb run &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --agent terminus &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --model-name anthropic/claude-3-7-latest &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --dataset-name terminal-bench-core &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --dataset-version 0.1.1 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --n-concurrent &lt;span class=&#34;m&#34;&gt;8&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For more detailed instructions on submitting to the leaderboard, view our &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/submitting-to-leaderboard&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;leaderboard submission guide&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;For more information on Terminal-Bench datasets and versioning view our &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/registry&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;registry overview&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;creating-new-tasks&#34;&gt;Creating New Tasks
&lt;/h2&gt;&lt;p&gt;View our &lt;a class=&#34;link&#34; href=&#34;https://www.tbench.ai/docs/task-quickstart&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;task contribution quickstart&lt;/a&gt; to create a new task.&lt;/p&gt;
&lt;h2 id=&#34;citing-us&#34;&gt;Citing Us
&lt;/h2&gt;&lt;p&gt;If you found Terminal-Bench useful, please cite us as:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bibtex&#34; data-lang=&#34;bibtex&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@misc&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;tbench_2025&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Terminal-Bench: A Benchmark for AI Agents in Terminal Environments}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;na&#34;&gt;url&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{https://github.com/laude-institute/terminal-bench}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;      &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{The Terminal-Bench Team}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2025}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;na&#34;&gt;month&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Apr}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        
    </channel>
</rss>
