<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>Multimodal on Producthunt daily</title>
        <link>https://producthunt.programnotes.cn/en/tags/multimodal/</link>
        <description>Recent content in Multimodal on Producthunt daily</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <lastBuildDate>Sun, 21 Sep 2025 15:24:57 +0800</lastBuildDate><atom:link href="https://producthunt.programnotes.cn/en/tags/multimodal/index.xml" rel="self" type="application/rss+xml" /><item>
        <title>OM1</title>
        <link>https://producthunt.programnotes.cn/en/p/om1/</link>
        <pubDate>Sun, 21 Sep 2025 15:24:57 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/om1/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1675021278785-adc2f7d92173?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTg0MzkzOTB8&amp;ixlib=rb-4.1.0" alt="Featured image of post OM1" /&gt;&lt;h1 id=&#34;openmindom1&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenMind/OM1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenMind/OM1&lt;/a&gt;
&lt;/h1&gt;&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/853153b7-351a-433d-9e1a-d257b781f93c&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;OM_Banner_X2 (1)&#34;
	
	
&gt;&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;  &lt;a href=&#34;https://arxiv.org/abs/2412.18588&#34;&gt;Technical Paper&lt;/a&gt; |  &lt;a href=&#34;https://docs.openmind.org/&#34;&gt;Documentation&lt;/a&gt; |  &lt;a href=&#34;https://x.com/openmind_agi&#34;&gt;X&lt;/a&gt; | &lt;a href=&#34;https://discord.gg/VUjpg4ef5n&#34;&gt;Discord&lt;/a&gt; &lt;/p&gt;
&lt;p&gt;&lt;strong&gt;OpenMind&amp;rsquo;s OM1 is a modular AI runtime that empowers developers to create and deploy multimodal AI agents across digital environments and physical robots&lt;/strong&gt;, including humanoids, phone apps, websites, quadrupeds, and educational robots such as the TurtleBot 4. OM1 agents can process diverse inputs such as web data, social media, camera feeds, and LIDAR, while enabling physical actions including motion, autonomous navigation, and natural conversation. The goal of OM1 is to make it easy to create highly capable, human-focused robots that are easy to upgrade and (re)configure to accommodate different physical form factors.&lt;/p&gt;
&lt;h2 id=&#34;capabilities-of-om1&#34;&gt;Capabilities of OM1
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Modular Architecture&lt;/strong&gt;: Designed with Python for simplicity and seamless integration.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Data Input&lt;/strong&gt;: Easily handles new data and sensors.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Hardware Support via Plugins&lt;/strong&gt;: Supports new hardware through plugins for API endpoints and specific robot hardware connections to &lt;code&gt;ROS2&lt;/code&gt;, &lt;code&gt;Zenoh&lt;/code&gt;, and &lt;code&gt;CycloneDDS&lt;/code&gt;. (We recommend &lt;code&gt;Zenoh&lt;/code&gt; for all new development).&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Web-Based Debugging Display&lt;/strong&gt;: Monitor the system in action with WebSim (available at http://localhost:8000/) for easy visual debugging.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Pre-configured Endpoints&lt;/strong&gt;: Supports Voice-to-Speech, OpenAI’s &lt;code&gt;gpt-4o&lt;/code&gt;, DeepSeek, and multiple Visual Language Models (VLMs) with pre-configured endpoints for each service.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;architecture-overview&#34;&gt;Architecture Overview
&lt;/h2&gt;&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/14e9b916-4df7-4700-9336-2983c85be311&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Artboard 1@4x 1 (1)&#34;
	
	
&gt;&lt;/p&gt;
&lt;h2 id=&#34;getting-started---hello-world&#34;&gt;Getting Started - Hello World
&lt;/h2&gt;&lt;p&gt;To get started with OM1, let&amp;rsquo;s run the Spot agent. Spot uses your webcam to capture and label objects. These text captions are then sent to OpenAI&amp;rsquo;s &lt;code&gt;gpt-4o&lt;/code&gt;, which returns &lt;code&gt;movement&lt;/code&gt;, &lt;code&gt;speech&lt;/code&gt;, and &lt;code&gt;face&lt;/code&gt; action commands. These commands are displayed in WebSim along with basic timing and other debugging information.&lt;/p&gt;
&lt;h3 id=&#34;package-management-and-venv&#34;&gt;Package Management and VENV
&lt;/h3&gt;&lt;p&gt;You will need the &lt;a class=&#34;link&#34; href=&#34;https://docs.astral.sh/uv/getting-started/installation/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;code&gt;uv&lt;/code&gt; package manager&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;clone-the-repo&#34;&gt;Clone the Repo
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/openmind/OM1.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; OM1
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git submodule update --init
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv venv
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;install-dependencies&#34;&gt;Install Dependencies
&lt;/h3&gt;&lt;p&gt;For macOS&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;brew install portaudio ffmpeg
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For Linux&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;sudo apt-get update
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;sudo apt-get install portaudio19-dev python3-dev ffmpeg
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;obtain-an-openmind-api-key&#34;&gt;Obtain an OpenMind API Key
&lt;/h3&gt;&lt;p&gt;Obtain your API Key at &lt;a class=&#34;link&#34; href=&#34;https://portal.openmind.org/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenMind Portal&lt;/a&gt;. Copy it into &lt;code&gt;config/spot.json5&lt;/code&gt;, replacing the &lt;code&gt;openmind_free&lt;/code&gt; placeholder. Alternatively, run &lt;code&gt;cp env.example .env&lt;/code&gt; and add your key to the &lt;code&gt;.env&lt;/code&gt; file.&lt;/p&gt;
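&lt;p&gt;For example, a minimal &lt;code&gt;.env&lt;/code&gt; setup might look like this (the exact variable name is defined in &lt;code&gt;env.example&lt;/code&gt;; &lt;code&gt;OM1_API_KEY&lt;/code&gt; below is illustrative):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;cp env.example .env
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Illustrative variable name -- check env.example for the exact key OM1 expects&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;echo&lt;/span&gt; &amp;#34;OM1_API_KEY=your_key_here&amp;#34; &amp;gt;&amp;gt; .env
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;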
&lt;h3 id=&#34;launching-om1&#34;&gt;Launching OM1
&lt;/h3&gt;&lt;p&gt;Run&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv run src/run.py spot
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;After launching OM1, the Spot agent will interact with you and perform (simulated) actions. For more help connecting OM1 to your robot hardware, see &lt;a class=&#34;link&#34; href=&#34;https://docs.openmind.org/getting-started&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;getting started&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;whats-next&#34;&gt;What&amp;rsquo;s Next?
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;Try out some &lt;a class=&#34;link&#34; href=&#34;https://docs.openmind.org/examples&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;examples&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Add new &lt;code&gt;inputs&lt;/code&gt; and &lt;code&gt;actions&lt;/code&gt;.&lt;/li&gt;
&lt;li&gt;Design custom agents and robots by creating your own &lt;code&gt;json5&lt;/code&gt; config files with custom combinations of inputs and actions (see the sketch after this list).&lt;/li&gt;
&lt;li&gt;Change the system prompts in the configuration files (located in &lt;code&gt;/config/&lt;/code&gt;) to create new behaviors.&lt;/li&gt;
&lt;/ul&gt;
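&lt;p&gt;A minimal, hypothetical &lt;code&gt;json5&lt;/code&gt; sketch of such a config (field names are illustrative; see the shipped files in &lt;code&gt;/config/&lt;/code&gt;, such as &lt;code&gt;spot.json5&lt;/code&gt;, for the real schema):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-json5&#34; data-lang=&#34;json5&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;// config/my_agent.json5 -- illustrative only; copy an existing config as a starting point
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;{
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  name: &amp;#34;my_agent&amp;#34;,
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  system_prompt: &amp;#34;You are a friendly, helpful quadruped robot.&amp;#34;,
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  inputs: [&amp;#34;camera&amp;#34;, &amp;#34;lidar&amp;#34;],
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  actions: [&amp;#34;move&amp;#34;, &amp;#34;speak&amp;#34;, &amp;#34;face&amp;#34;],
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;}
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;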
&lt;h2 id=&#34;interfacing-with-new-robot-hardware&#34;&gt;Interfacing with New Robot Hardware
&lt;/h2&gt;&lt;p&gt;OM1 assumes that robot hardware provides a high-level SDK that accepts elemental movement and action commands such as &lt;code&gt;backflip&lt;/code&gt;, &lt;code&gt;run&lt;/code&gt;, &lt;code&gt;gently pick up the red apple&lt;/code&gt;, &lt;code&gt;move(0.37, 0, 0)&lt;/code&gt;, and &lt;code&gt;smile&lt;/code&gt;. An example is provided in &lt;code&gt;actions/move_safe/connector/ros2.py&lt;/code&gt;:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;o&#34;&gt;...&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;elif&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;output_interface&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;action&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;==&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;shake paw&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;bp&#34;&gt;self&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sport_client&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;bp&#34;&gt;self&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sport_client&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;Hello&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;o&#34;&gt;...&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;If your robot hardware does not yet provide a suitable HAL (hardware abstraction layer), you will need to create one using traditional robotics approaches such as RL (reinforcement learning), in concert with suitable simulation environments (Unity, Gazebo), sensors (such as hand-mounted ZED depth cameras), and custom VLAs. It is further assumed that your HAL accepts motion trajectories, provides battery and thermal management/monitoring, and calibrates and tunes sensors such as IMUs, LIDARs, and magnetometers.&lt;/p&gt;
&lt;p&gt;OM1 can interface with your HAL via USB, serial, ROS2, CycloneDDS, Zenoh, or websockets. For an example of an advanced humanoid HAL, please see &lt;a class=&#34;link&#34; href=&#34;https://github.com/unitreerobotics/unitree_sdk2/blob/adee312b081c656ecd0bb4e936eed96325546296/example/g1/high_level/g1_loco_client_example.cpp#L159&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Unitree&amp;rsquo;s C++ SDK&lt;/a&gt;. Frequently, a HAL, especially ROS2 code, will be dockerized and can then interface with OM1 through DDS middleware or websockets.&lt;/p&gt;
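&lt;p&gt;As a minimal sketch of the Zenoh path (assuming the &lt;code&gt;eclipse-zenoh&lt;/code&gt; Python package; the key expression and payload below are illustrative, not OM1&amp;rsquo;s actual topic schema):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;import zenoh
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;# Open a Zenoh session and publish one movement command to a HAL subscriber.
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;session = zenoh.open(zenoh.Config())
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;publisher = session.declare_publisher(&amp;#34;robot/command/move&amp;#34;)
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;publisher.put(&amp;#34;move(0.37, 0, 0)&amp;#34;)  # hypothetical payload format
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;session.close()
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;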
&lt;h2 id=&#34;recommended-development-platforms&#34;&gt;Recommended Development Platforms
&lt;/h2&gt;&lt;p&gt;OM1 is developed on:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Jetson AGX Orin 64GB (running Ubuntu 22.04 and JetPack 6.1)&lt;/li&gt;
&lt;li&gt;Mac Studio with Apple M2 Ultra and 48 GB unified memory (running macOS Sequoia)&lt;/li&gt;
&lt;li&gt;Mac Mini with Apple M4 Pro and 48 GB unified memory (running macOS Sequoia)&lt;/li&gt;
&lt;li&gt;Generic Linux machines (running Ubuntu 22.04)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;OM1 &lt;em&gt;should&lt;/em&gt; run on other platforms (such as Windows) and single-board computers such as the Raspberry Pi 5 16GB.&lt;/p&gt;
&lt;h2 id=&#34;full-autonomy-guidance&#34;&gt;Full Autonomy Guidance
&lt;/h2&gt;&lt;p&gt;We&amp;rsquo;re excited to introduce &lt;strong&gt;full autonomy mode&lt;/strong&gt;, where three services work together in a loop without manual intervention:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;om1&lt;/strong&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;unitree_go2_ros2_sdk&lt;/strong&gt; – A ROS 2 package that provides SLAM (Simultaneous Localization and Mapping) capabilities for the Unitree Go2 robot using an RPLiDAR sensor, the SLAM Toolbox, and the Nav2 stack.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;om1-avatar&lt;/strong&gt; – A modern React-based frontend application that provides the user interface and avatar display system for OM1 robotics software.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;intro-to-backpack&#34;&gt;Intro to Backpack
&lt;/h2&gt;&lt;p&gt;From research to real-world autonomy, Backpack is a platform that learns, moves, and builds with you.
We&amp;rsquo;ll shortly be releasing the &lt;strong&gt;BOM&lt;/strong&gt; (bill of materials) and &lt;strong&gt;DIY&lt;/strong&gt; build details for it.
Stay tuned!&lt;/p&gt;
&lt;p&gt;Clone the following repos:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenMind/OM1.git&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/OpenMind/OM1.git&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenMind/unitree_go2_ros2_sdk.git&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/OpenMind/unitree_go2_ros2_sdk.git&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenMind/OM1-avatar.git&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/OpenMind/OM1-avatar.git&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;starting-the-system&#34;&gt;Starting the System
&lt;/h2&gt;&lt;p&gt;To start all services, run the following commands:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;For OM1&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; OM1
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker-compose up om1 -d --no-build
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;For unitree_go2_ros2_sdk&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; unitree_go2_ros2_sdk
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker-compose up orchestrator -d --no-build
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker-compose up om1_sensor -d --no-build
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker-compose up watchdog -d --no-build
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;For OM1-avatar&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; OM1-avatar
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker-compose up om1_avatar -d --no-build
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;detailed-documentation&#34;&gt;Detailed Documentation
&lt;/h2&gt;&lt;p&gt;More detailed documentation can be accessed at &lt;a class=&#34;link&#34; href=&#34;https://docs.openmind.org/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;docs.openmind.org&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;p&gt;Please make sure to read the &lt;a class=&#34;link&#34; href=&#34;./CONTRIBUTING.md&#34; &gt;Contributing Guide&lt;/a&gt; before making a pull request.&lt;/p&gt;
&lt;h2 id=&#34;license&#34;&gt;License
&lt;/h2&gt;&lt;p&gt;This project is licensed under the MIT License, a permissive and widely used free software license known for its simplicity and flexibility. It allows anyone to freely use, modify, and distribute the software, encouraging collaboration and redistribution.&lt;/p&gt;
</description>
        </item>
        <item>
        <title>ten-framework</title>
        <link>https://producthunt.programnotes.cn/en/p/ten-framework/</link>
        <pubDate>Fri, 19 Sep 2025 15:27:39 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/ten-framework/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1557547190-89ae9e79327a?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTgyNjY3ODh8&amp;ixlib=rb-4.1.0" alt="Featured image of post ten-framework" /&gt;&lt;h1 id=&#34;ten-frameworkten-framework&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN-framework/ten-framework&lt;/a&gt;
&lt;/h1&gt;&lt;div align=&#34;center&#34;&gt; &lt;a name=&#34;readme-top&#34;&gt;&lt;/a&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/7c8f72d7-3993-4d01-8504-b71578a22944&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN banner&#34;
	
	
&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/releases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/v/release/ten-framework/ten-framework?color=369eff&amp;amp;labelColor=gray&amp;amp;logo=github&amp;amp;style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN Releases&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/releases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/release-date/ten-framework/ten-framework?labelColor=gray&amp;amp;style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/discussions/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/discussions/TEN-framework/ten_framework?labelColor=gray&amp;amp;color=%20%23f79009&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Discussion posts&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/graphs/commit-activity&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/commit-activity/m/TEN-framework/ten_framework?labelColor=gray&amp;amp;color=pink&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Commits&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/issues&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/issues-search?query=repo%3ATEN-framework%2Ften-framework%20is%3Aclosed&amp;amp;label=issues%20closed&amp;amp;labelColor=gray&amp;amp;color=green&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Issues closed&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/graphs/contributors&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/contributors/ten-framework/ten-framework?color=c4f042&amp;amp;labelColor=gray&amp;amp;style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/pulls&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/PRs-welcome!-brightgreen.svg?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;PRs Welcome&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten_framework/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/License-Apache_2.0_with_certain_conditions-blue.svg?labelColor=%20%23155EEF&amp;amp;color=%20%23528bff&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub license&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://deepwiki.com/TEN-framework/TEN-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://deepwiki.com/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Ask DeepWiki&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://readmex.com/TEN-framework/ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://raw.githubusercontent.com/CodePhiliaX/resource-trusteeship/main/readmex.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;ReadmeX&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://GitHub.com/TEN-framework/ten_framework/watchers/?WT.mc_id=academic-105485-koreyst&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/watchers/TEN-framework/ten_framework?style=social&amp;amp;label=Watch&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub watchers&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://GitHub.com/TEN-framework/ten_framework/network/?WT.mc_id=academic-105485-koreyst&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/forks/TEN-framework/ten_framework?style=social&amp;amp;label=Fork&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub forks&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://GitHub.com/TEN-framework/ten_framework/stargazers/?WT.mc_id=academic-105485-koreyst&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/stars/TEN-framework/ten_framework?style=social&amp;amp;label=Star&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub stars&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/README.md&#34;&gt;&lt;img alt=&#34;README in English&#34; src=&#34;https://img.shields.io/badge/English-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-CN.md&#34;&gt;&lt;img alt=&#34;简体中文操作指南&#34; src=&#34;https://img.shields.io/badge/简体中文-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-JP.md&#34;&gt;&lt;img alt=&#34;日本語のREADME&#34; src=&#34;https://img.shields.io/badge/日本語-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-KR.md&#34;&gt;&lt;img alt=&#34;README in 한국어&#34; src=&#34;https://img.shields.io/badge/한국어-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-ES.md&#34;&gt;&lt;img alt=&#34;README en Español&#34; src=&#34;https://img.shields.io/badge/Español-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-FR.md&#34;&gt;&lt;img alt=&#34;README en Français&#34; src=&#34;https://img.shields.io/badge/Français-lightgrey&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/TEN-framework/ten-framework/blob/main/docs/README-IT.md&#34;&gt;&lt;img alt=&#34;README in Italiano&#34; src=&#34;https://img.shields.io/badge/Italiano-lightgrey&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://theten.ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Official Site&lt;/a&gt;
•
&lt;a class=&#34;link&#34; href=&#34;https://theten.ai/docs/ten_agent/overview&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Documentation&lt;/a&gt;
•
&lt;a class=&#34;link&#34; href=&#34;https://theten.ai/blog&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Blog&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a href=&#34;https://trendshift.io/repositories/11978&#34; target=&#34;_blank&#34;&gt;&lt;img src=&#34;https://trendshift.io/api/badge/repositories/11978&#34; alt=&#34;TEN-framework%2Ften_framework | Trendshift&#34; style=&#34;width: 250px; height: 55px;&#34; width=&#34;250&#34; height=&#34;55&#34;/&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;details&gt;
  &lt;summary&gt;&lt;kbd&gt;Table of Contents&lt;/kbd&gt;&lt;/summary&gt;
&lt;h4 id=&#34;table-of-contents&#34;&gt;Table of Contents
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-welcome-to-ten&#34; &gt;👋 Welcome to TEN&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-tman-designer&#34; &gt;🎨 TMAN Designer&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-features&#34; &gt;✨ Features&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#1%ef%b8%8f%e2%83%a3-real-time-avatar&#34; &gt;1️⃣ Real-time Avatar&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#2%ef%b8%8f%e2%83%a3-real-time-voice-with-mcp-servers&#34; &gt;2️⃣ Real-time voice with MCP servers&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#3%ef%b8%8f%e2%83%a3-real-time-communication-with-hardware&#34; &gt;3️⃣ Real-time communication with hardware&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#4%ef%b8%8f%e2%83%a3-real-time-vision-and-real-time-screenshare-detection&#34; &gt;4️⃣ Real-time vision and real-time screenshare detection&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#5%ef%b8%8f%e2%83%a3-ten-with-other-llm-platforms&#34; &gt;5️⃣ TEN with other LLM platforms&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#6%ef%b8%8f%e2%83%a3-storyteller---ten-image-generation&#34; &gt;6️⃣ StoryTeller - TEN image generation&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-get-ten-agent-up-and-running&#34; &gt;👩‍💻 Get TEN Agent up and running&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#%f0%9f%85%b0%ef%b8%8f-run-ten-agent-in-localhost&#34; &gt;🅰️ Run TEN Agent in &lt;code&gt;localhost&lt;/code&gt;&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#%f0%9f%85%b1%ef%b8%8f-run-ten-agent-in-codespaceno-docker&#34; &gt;🅱️ Run TEN Agent in Codespace (no Docker)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#%ef%b8%8f-ten-agent-self-hosting&#34; &gt;🛳️ TEN Agent Self Hosting&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#%f0%9f%85%b0%ef%b8%8f-deploying-with-docker&#34; &gt;🅰️ Deploying with Docker&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#%f0%9f%85%b1%ef%b8%8f-deploying-with-other-cloud-services&#34; &gt;🅱️ Deploying with other cloud services&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-ten-ecosystem&#34; &gt;🌍 TEN Ecosystem&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-ask-questions&#34; &gt;❓ Ask Questions&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#-contributing&#34; &gt;🥰 Contributing&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#code-contributors&#34; &gt;Code Contributors&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#contribution-guidelines&#34; &gt;Contribution Guidelines&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#license&#34; &gt;License&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;br/&gt;
&lt;/details&gt;
&lt;h2 id=&#34;-welcome-to-ten&#34;&gt;👋 Welcome to TEN
&lt;/h2&gt;&lt;p&gt;TEN is a comprehensive open-source ecosystem for creating, customizing, and deploying real-time conversational AI agents with multimodal capabilities including voice, vision, and avatar interactions.&lt;/p&gt;
&lt;p&gt;TEN includes &lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN Framework&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten-turn-detection&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN Turn Detection&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten-vad&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN VAD&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/tree/main/ai_agents/demo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN Agent&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/tree/main/core/src/ten_manager/designer_frontend&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TMAN Designer&lt;/a&gt;, and &lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/portal&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN Portal&lt;/a&gt;. Check out &lt;a class=&#34;link&#34; href=&#34;#-ten-ecosystem&#34; &gt;🌍 TEN Ecosystem&lt;/a&gt; for more details.&lt;/p&gt;
&lt;br&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Community Channel&lt;/th&gt;
          &lt;th&gt;Purpose&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://twitter.com/intent/follow?screen_name=TenFramework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/twitter/follow/TenFramework?logo=X&amp;amp;color=%20%23f5f5f5&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Follow on X&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Follow TEN Framework on X for updates and announcements&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.linkedin.com/company/ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://custom-icon-badges.demolab.com/badge/LinkedIn-TEN_Framework-0A66C2?logo=linkedin-white&amp;amp;logoColor=fff&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Follow on LinkedIn&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Follow TEN Framework on LinkedIn for updates and announcements&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://discord.gg/VnPftUzAMJ&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/Discord-Join%20TEN%20Community-5865F2?style=flat&amp;amp;logo=discord&amp;amp;logoColor=white&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Discord TEN Community&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Join our Discord community to connect with developers&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/TEN-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/Hugging%20Face-TEN%20Framework-yellow?style=flat&amp;amp;logo=huggingface&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Hugging Face Space&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Join our Hugging Face community to explore our spaces and models&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-agent/discussions/170&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/TEN_Framework-WeChat_Group-%2307C160?logo=wechat&amp;amp;labelColor=darkgreen&amp;amp;color=gray&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;WeChat&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Join our WeChat group for Chinese community discussions&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;br&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Important&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Star TEN Repositories&lt;/strong&gt; ⭐️&lt;/p&gt;
&lt;p&gt;Get instant notifications for new releases and updates. Your support helps us grow and improve TEN!&lt;/p&gt;
&lt;/blockquote&gt;
&lt;br&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/eeebe996-8c14-4bf7-82ae-f1a1f7e30705&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN star us gif&#34;
	
	
&gt;&lt;/p&gt;
&lt;br&gt;
&lt;details&gt;
  &lt;summary&gt;&lt;kbd&gt;Star History&lt;/kbd&gt;&lt;/summary&gt;
  &lt;picture&gt;
    &lt;img width=&#34;100%&#34; src=&#34;https://api.star-history.com/svg?repos=ten-framework/ten-framework&amp;type=Date&#34;&gt;
  &lt;/picture&gt;
&lt;/details&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-tman-designer&#34;&gt;🎨 TMAN Designer
&lt;/h2&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/user-attachments/assets/44c6a087-ec7a-45b0-a084-dab5dac5e36b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/user-attachments/assets/44c6a087-ec7a-45b0-a084-dab5dac5e36b&lt;/a&gt;&lt;/p&gt;
&lt;h3 id=&#34;tman-designer&#34;&gt;TMAN Designer
&lt;/h3&gt;&lt;p&gt;TMAN Designer is a low/no-code option to create voice agents with an easy-to-use workflow UI. It can load apps and graphs, and includes an online editor, log viewer, and much more.&lt;/p&gt;
&lt;p&gt;Check out &lt;a class=&#34;link&#34; href=&#34;https://theten.ai/blog/tman-designer-of-ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this blog&lt;/a&gt; for more details.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-features&#34;&gt;✨ Features
&lt;/h2&gt;&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/c6702995-de94-4d3e-8cae-af097f087ac1&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN Agent with Trulience&#34;
	
	
&gt;&lt;/p&gt;
&lt;h3 id=&#34;1-real-time-avatar&#34;&gt;1️⃣ Real-time Avatar
&lt;/h3&gt;&lt;p&gt;Build engaging AI avatars with TEN Agent using &lt;a class=&#34;link&#34; href=&#34;https://trulience.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Trulience&lt;/a&gt;&amp;rsquo;s diverse collection of free avatar options. To get it up and running, you only need two steps:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Follow the README to finish setting up and running the Playground&lt;/li&gt;
&lt;li&gt;Enter the avatar ID and &lt;a class=&#34;link&#34; href=&#34;https://trulience.com/docs#/authentication/jwt-tokens/jwt-tokens?id=use-your-custom-userid&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;token&lt;/a&gt; you get from &lt;a class=&#34;link&#34; href=&#34;https://trulience.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Trulience&lt;/a&gt;&lt;/li&gt;
&lt;/ol&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/afb77ad3-9c23-452f-b870-216687779017&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN with MCP servers&#34;
	
	
&gt;&lt;/p&gt;
&lt;h3 id=&#34;2-real-time-voice-with-mcp-servers&#34;&gt;2️⃣ Real-time voice with MCP servers
&lt;/h3&gt;&lt;p&gt;TEN Agent now integrates seamlessly with MCP servers, expanding its LLM capabilities. To get started:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Open the Module Picker in Playground&lt;/li&gt;
&lt;li&gt;Add the MCP server tool for LLM integration&lt;/li&gt;
&lt;li&gt;Paste a URL from your MCP server in the extension&lt;/li&gt;
&lt;li&gt;Start a realtime conversation with TEN Agent&lt;/li&gt;
&lt;/ol&gt;
&lt;p&gt;This integration allows you to leverage MCP&amp;rsquo;s diverse server offerings while maintaining TEN Agent&amp;rsquo;s powerful conversational abilities.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/user-attachments/assets/78647eef-2d66-44e6-99a8-1918a940fb9f&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/user-attachments/assets/78647eef-2d66-44e6-99a8-1918a940fb9f&lt;/a&gt;&lt;/p&gt;
&lt;h3 id=&#34;3-real-time-communication-with-hardware&#34;&gt;3️⃣ Real-time communication with hardware
&lt;/h3&gt;&lt;p&gt;TEN Agent is now running on the Espressif ESP32-S3 Korvo V3 development board, an excellent way to integrate realtime communication with LLMs on hardware.&lt;/p&gt;
&lt;p&gt;Check out the &lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/tree/main/ai_agents/esp32-client&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;integration guide&lt;/a&gt; for more details.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/a1addb02-a450-47be-8cb2-d25e3b574f53&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Real-time Vision&#34;
	
	
&gt;&lt;/p&gt;
&lt;h3 id=&#34;4-real-time-vision-and-real-time-screenshare-detection&#34;&gt;4️⃣ Real-time vision and real-time screenshare detection
&lt;/h3&gt;&lt;p&gt;Try the Google Gemini Multimodal Live API with realtime vision and realtime screenshare detection. It ships as a ready-to-use extension, with powerful tools like Weather Check and Web Search integrated into TEN Agent.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/234ff443-bef8-4cc4-9a10-09d6ec3f5bc1&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN with Dify&#34;
	
	
&gt;&lt;/p&gt;
&lt;h3 id=&#34;5-ten-with-other-llm-platforms&#34;&gt;5️⃣ TEN with other LLM platforms
&lt;/h3&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://theten.ai/docs/ten_agent/playground/use-cases/voice-assistant/run_dify#steps&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TEN Agent + Dify&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;TEN also offers great support for making the realtime interactive experience even better on other LLM platforms; check out the docs for more.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;p&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/fe28a549-ddb9-431e-9282-57539fb87371&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN StoryTeller&#34;
	
	
&gt;&lt;/p&gt;
&lt;h3 id=&#34;6-storyteller---ten-image-generation&#34;&gt;6️⃣ StoryTeller - TEN image generation
&lt;/h3&gt;&lt;p&gt;Experience real-time image generation with StoryTeller. It ships as a ready-to-use extension, with powerful tools like Weather Check and Web Search integrated into TEN.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-get-ten-agent-up-and-running&#34;&gt;👩‍💻 Get TEN Agent up and running
&lt;/h2&gt;&lt;h4 id=&#34;-run-ten-agent-in-localhost&#34;&gt;🅰️ Run TEN Agent in localhost
&lt;/h4&gt;&lt;h4 id=&#34;step----prerequisites&#34;&gt;Step ⓵ - Prerequisites
&lt;/h4&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Category&lt;/th&gt;
          &lt;th&gt;Requirements&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;strong&gt;Keys&lt;/strong&gt;&lt;/td&gt;
          &lt;td&gt;• Agora &lt;a class=&#34;link&#34; href=&#34;https://docs.agora.io/en/video-calling/get-started/manage-agora-account?platform=web#create-an-agora-project&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;App ID&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://docs.agora.io/en/video-calling/get-started/manage-agora-account?platform=web#create-an-agora-project&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;App Certificate&lt;/a&gt; (free minutes every month) &lt;br&gt;• &lt;a class=&#34;link&#34; href=&#34;https://openai.com/index/openai-api/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenAI&lt;/a&gt; API key (or any LLM compatible with the OpenAI API)&lt;br&gt;• &lt;a class=&#34;link&#34; href=&#34;https://deepgram.com/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Deepgram&lt;/a&gt; ASR (free credits available with signup)&lt;br&gt;• &lt;a class=&#34;link&#34; href=&#34;https://elevenlabs.io/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Elevenlabs&lt;/a&gt; TTS (free credits available with signup)&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;strong&gt;Installation&lt;/strong&gt;&lt;/td&gt;
          &lt;td&gt;• &lt;a class=&#34;link&#34; href=&#34;https://www.docker.com/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docker&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://docs.docker.com/compose/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docker Compose&lt;/a&gt;&lt;br&gt;• &lt;a class=&#34;link&#34; href=&#34;https://nodejs.org/en&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Node.js (LTS) v18&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;strong&gt;Minimum System Requirements&lt;/strong&gt;&lt;/td&gt;
          &lt;td&gt;• CPU &amp;gt;= 2 cores&lt;br&gt;• RAM &amp;gt;= 4 GB&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;br&gt;
&lt;blockquote&gt;
&lt;p&gt;&lt;strong&gt;Note&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;macOS: Docker setting on Apple Silicon&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Uncheck &amp;ldquo;Use Rosetta for x86/amd64 emulation&amp;rdquo; in Docker settings. This may result in slower build times on ARM, but performance will be normal when deployed to x64 servers.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;br&gt;
&lt;h4 id=&#34;step----build-agent-in-vm&#34;&gt;Step ⓶ - Build agent in VM
&lt;/h4&gt;&lt;h5 id=&#34;1-clone-down-the-repocd-to-ai-agents-and-create-env-file-from-envexample&#34;&gt;1. Clone the repo, &lt;code&gt;cd&lt;/code&gt; to &lt;code&gt;ai_agents&lt;/code&gt;, and create a &lt;code&gt;.env&lt;/code&gt; file from &lt;code&gt;.env.example&lt;/code&gt;
&lt;/h5&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/TEN-framework/ten-framework.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; ten-framework/ai_agents
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;cp ./.env.example ./.env
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h5 id=&#34;2-setup-agora-app-id-and-app-certificate-in-env&#34;&gt;2. Set up the Agora App ID and App Certificate in &lt;code&gt;.env&lt;/code&gt;
&lt;/h5&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AGORA_APP_ID&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AGORA_APP_CERTIFICATE&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h5 id=&#34;3-start-agent-development-containers&#34;&gt;3. Start agent development containers
&lt;/h5&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose up -d
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h5 id=&#34;4-enter-container&#34;&gt;4. Enter container
&lt;/h5&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; -it ten_agent_dev bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h5 id=&#34;5-build-agent-with-the-default-graph--5min---8min&#34;&gt;5. Build the agent with the default &lt;code&gt;graph&lt;/code&gt; (~5-8 min)
&lt;/h5&gt;&lt;p&gt;Check the &lt;code&gt;/examples&lt;/code&gt; folder for more examples.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use the chained voice assistant&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;task use &lt;span class=&#34;nv&#34;&gt;AGENT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;voice-assistant
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# or use the speech-to-speech voice assistant realtime&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;task use &lt;span class=&#34;nv&#34;&gt;AGENT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;voice-assistant-realtime
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h5 id=&#34;6-start-the-web-server&#34;&gt;6. Start the web server
&lt;/h5&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Run task build if you changed any local source code; this is necessary for languages that require compilation, such as TypeScript or Go.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;task build
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;task run
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;br&gt;
&lt;h4 id=&#34;step----customize-your-agent-with-tman-designer&#34;&gt;Step ⓷ - Customize your agent with TMAN Designer
&lt;/h4&gt;&lt;ol&gt;
&lt;li&gt;Open &lt;a class=&#34;link&#34; href=&#34;http://localhost:49483&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;localhost:49483&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Right-click on the STT, LLM, and TTS extensions.&lt;/li&gt;
&lt;li&gt;Open their properties and enter the API keys for each.&lt;/li&gt;
&lt;li&gt;Right-click the canvas and select &amp;lsquo;Manage Apps&amp;rsquo; to open the Apps Manager.&lt;/li&gt;
&lt;li&gt;Under Actions, click ▶ to run the app.&lt;/li&gt;
&lt;li&gt;Check the &amp;lsquo;Run with TEN Agent&amp;rsquo; option and click the Run button.&lt;/li&gt;
&lt;/ol&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h3 id=&#34;-run-ten-agent-in-codespaceno-docker&#34;&gt;🅱️ Run TEN Agent in Codespace (no Docker)
&lt;/h3&gt;&lt;p&gt;GitHub offers a free Codespace for each repository, so you can run the playground in a Codespace without using Docker. A Codespace is also often much faster than running on localhost.&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://codespaces.new/ten-framework/ten-agent&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://github.com/codespaces/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Check out &lt;a class=&#34;link&#34; href=&#34;https://theten.ai/docs/ten_agent/setup_development_env/setting_up_development_inside_codespace&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this guide&lt;/a&gt; for more details.&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-ten-agent-self-hosting&#34;&gt;🛳️ TEN Agent Self Hosting
&lt;/h2&gt;&lt;h4 id=&#34;-deploying-with-docker&#34;&gt;🅰️ Deploying with Docker
&lt;/h4&gt;&lt;p&gt;Once you have customized your agent (either by using the TMAN Manager, Playground, or editing &lt;code&gt;property.json&lt;/code&gt; directly), you can deploy it by creating a release Docker image for your service.&lt;/p&gt;
&lt;p&gt;Read the &lt;a class=&#34;link&#34; href=&#34;https://theten.ai/docs/ten_agent/deploy_ten_agent/deploy_agent_service&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Deployment Guide&lt;/a&gt; for detailed information about deployment.&lt;/p&gt;
&lt;br&gt;
&lt;h4 id=&#34;-deploying-with-other-cloud-services&#34;&gt;🅱️ Deploying with other cloud services
&lt;/h4&gt;&lt;p&gt;&lt;em&gt;coming soon&lt;/em&gt;&lt;/p&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-ten-ecosystem&#34;&gt;🌍 TEN Ecosystem
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Project&lt;/th&gt;
          &lt;th&gt;Preview&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten_framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;🏚️ TEN Framework&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;TEN is an open-source framework for real-time, multimodal conversational AI.&lt;br&gt;&lt;br&gt;&lt;img src=&#34;https://img.shields.io/github/stars/ten-framework/ten_framework?color=ffcb47&amp;amp;labelColor=gray&amp;amp;style=flat-square&amp;amp;logo=github&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/7c8f72d7-3993-4d01-8504-b71578a22944&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten-turn-detection&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;️🔂 TEN Turn Detection&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;TEN Turn Detection enables full-duplex dialogue communication.&lt;br&gt;&lt;br&gt;&lt;img src=&#34;https://img.shields.io/github/stars/ten-framework/ten-turn-detection?color=ffcb47&amp;amp;labelColor=gray&amp;amp;style=flat-square&amp;amp;logo=github&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/8d0ec716-5d0e-43e4-ad9a-d97b17305658&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/ten-vad&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;🔉 TEN VAD&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;TEN VAD is a low-latency, lightweight and high-performance streaming voice activity detector (VAD).&lt;br&gt;&lt;br&gt;&lt;img src=&#34;https://img.shields.io/github/stars/ten-framework/ten-vad?color=ffcb47&amp;amp;labelColor=gray&amp;amp;style=flat-square&amp;amp;logo=github&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/d45870e4-9453-4047-8163-08737f82863f&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/tree/main/ai_agents&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;🎙️ TEN Agent&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;TEN Agent is a showcase of the TEN Framework.&lt;br&gt;&lt;br&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/38de2207-939b-4702-a0aa-04491f5b5275&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-framework/tree/main/core/src/ten_manager/designer_frontend&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;🎨 TMAN Designer&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;TMAN Designer is a low/no-code option for building a voice agent with an easy-to-use workflow UI.&lt;br&gt;&lt;br&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/804c3543-0a47-42b7-b40b-ef32b742fb8f&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ten-framework/portal&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;📒 TEN Portal&lt;/strong&gt;&lt;/a&gt;&lt;br&gt;The official site of the TEN framework, with documentation and the blog.&lt;br&gt;&lt;br&gt;&lt;img src=&#34;https://img.shields.io/github/stars/ten-framework/portal?color=ffcb47&amp;amp;labelColor=gray&amp;amp;style=flat-square&amp;amp;logo=github&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td&gt;&lt;img src=&#34;https://github.com/user-attachments/assets/e17d8aaa-5928-45dd-ac71-814928e26a89&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;br&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;h2 id=&#34;-ask-questions&#34;&gt;❓ Ask Questions
&lt;/h2&gt;&lt;p&gt;TEN Framework is available on these AI-powered Q&amp;amp;A platforms. They can help you find answers quickly and accurately in multiple languages, covering everything from basic setup to advanced implementation details.&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Service&lt;/th&gt;
          &lt;th&gt;Link&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;DeepWiki&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://deepwiki.com/TEN-framework/TEN-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://deepwiki.com/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Ask DeepWiki&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;ReadmeX&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://readmex.com/TEN-framework/ten-framework&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://raw.githubusercontent.com/CodePhiliaX/resource-trusteeship/main/readmex.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;ReadmeX&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;br&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;h2 id=&#34;-contributing&#34;&gt;🥰 Contributing
&lt;/h2&gt;&lt;p&gt;We welcome all forms of open-source collaboration! Whether you&amp;rsquo;re fixing bugs, adding features, improving documentation, or sharing ideas - your contributions help advance personalized AI tools. Check out our GitHub Issues and Projects to find ways to contribute and show your skills. Together, we can build something amazing!&lt;/p&gt;
&lt;br&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;All kinds of contributions are welcome&lt;/strong&gt; 🙏&lt;/p&gt;
&lt;p&gt;Join us in building TEN better! Every contribution makes a difference, from code to documentation. Share your TEN Agent projects on social media to inspire others!&lt;/p&gt;
&lt;p&gt;Connect with one of the TEN maintainers &lt;a class=&#34;link&#34; href=&#34;https://x.com/elliotchen100&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@elliotchen100&lt;/a&gt; on 𝕏 or &lt;a class=&#34;link&#34; href=&#34;https://github.com/cyfyifanchen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@cyfyifanchen&lt;/a&gt; on GitHub for project updates, discussions and collaboration opportunities.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;br&gt;
&lt;h3 id=&#34;code-contributors&#34;&gt;Code Contributors
&lt;/h3&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/TEN-framework/ten-agent/graphs/contributors&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://contrib.rocks/image?repo=TEN-framework/ten-agent&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;TEN&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h3 id=&#34;contribution-guidelines&#34;&gt;Contribution Guidelines
&lt;/h3&gt;&lt;p&gt;Contributions are welcome! Please read the &lt;a class=&#34;link&#34; href=&#34;./docs/code-of-conduct/contributing.md&#34; &gt;contribution guidelines&lt;/a&gt; first.&lt;/p&gt;
&lt;h3 id=&#34;license&#34;&gt;License
&lt;/h3&gt;&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;The entire TEN framework (except for the folders explicitly listed below) is released under the Apache License, Version 2.0, with additional restrictions. For details, please refer to the &lt;a class=&#34;link&#34; href=&#34;./LICENSE&#34; &gt;LICENSE&lt;/a&gt; file located in the root directory of the TEN framework.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;The components within the &lt;code&gt;packages&lt;/code&gt; directory are released under the Apache License, Version 2.0. For details, please refer to the &lt;code&gt;LICENSE&lt;/code&gt; file located in each package&amp;rsquo;s root directory.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;The third-party libraries used by the TEN framework are listed and described in detail. For more information, please refer to the &lt;a class=&#34;link&#34; href=&#34;./third_party/&#34; &gt;third_party&lt;/a&gt; folder.&lt;/p&gt;
&lt;/li&gt;
&lt;/ol&gt;
&lt;div align=&#34;right&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#readme-top&#34; &gt;&lt;img src=&#34;https://img.shields.io/badge/-Back_to_top-gray?style=flat-square&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
</description>
        </item>
        <item>
        <title>transformers</title>
        <link>https://producthunt.programnotes.cn/en/p/transformers/</link>
        <pubDate>Sun, 14 Sep 2025 15:25:45 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/transformers/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1700245481730-d375ad70ff2b?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTc4MzQ2NTB8&amp;ixlib=rb-4.1.0" alt="Featured image of post transformers" /&gt;&lt;h1 id=&#34;huggingfacetransformers&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/huggingface/transformers&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;huggingface/transformers&lt;/a&gt;
&lt;/h1&gt;&lt;!---
Copyright 2020 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the &#34;License&#34;);
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an &#34;AS IS&#34; BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;picture&gt;
    &lt;source media=&#34;(prefers-color-scheme: dark)&#34; srcset=&#34;https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg&#34;&gt;
    &lt;source media=&#34;(prefers-color-scheme: light)&#34; srcset=&#34;https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg&#34;&gt;
    &lt;img alt=&#34;Hugging Face Transformers Library&#34; src=&#34;https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg&#34; width=&#34;352&#34; height=&#34;59&#34; style=&#34;max-width: 100%;&#34;&gt;
  &lt;/picture&gt;
  &lt;br/&gt;
  &lt;br/&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
    &lt;a href=&#34;https://huggingface.com/models&#34;&gt;&lt;img alt=&#34;Checkpoints on Hub&#34; src=&#34;https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&amp;color=brightgreen&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://circleci.com/gh/huggingface/transformers&#34;&gt;&lt;img alt=&#34;Build&#34; src=&#34;https://img.shields.io/circleci/build/github/huggingface/transformers/main&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/LICENSE&#34;&gt;&lt;img alt=&#34;GitHub&#34; src=&#34;https://img.shields.io/github/license/huggingface/transformers.svg?color=blue&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://huggingface.co/docs/transformers/index&#34;&gt;&lt;img alt=&#34;Documentation&#34; src=&#34;https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&amp;down_message=offline&amp;up_message=online&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://github.com/huggingface/transformers/releases&#34;&gt;&lt;img alt=&#34;GitHub release&#34; src=&#34;https://img.shields.io/github/release/huggingface/transformers.svg&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md&#34;&gt;&lt;img alt=&#34;Contributor Covenant&#34; src=&#34;https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg&#34;&gt;&lt;/a&gt;
    &lt;a href=&#34;https://zenodo.org/badge/latestdoi/155220641&#34;&gt;&lt;img src=&#34;https://zenodo.org/badge/155220641.svg&#34; alt=&#34;DOI&#34;&gt;&lt;/a&gt;
&lt;/p&gt;
&lt;h4 align=&#34;center&#34;&gt;
    &lt;p&gt;
        &lt;b&gt;English&lt;/b&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hans.md&#34;&gt;简体中文&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_zh-hant.md&#34;&gt;繁體中文&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_ko.md&#34;&gt;한국어&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_es.md&#34;&gt;Español&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md&#34;&gt;日本語&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md&#34;&gt;हिन्दी&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md&#34;&gt;Русский&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md&#34;&gt;Português&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_te.md&#34;&gt;తెలుగు&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md&#34;&gt;Français&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_de.md&#34;&gt;Deutsch&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_vi.md&#34;&gt;Tiếng Việt&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_ar.md&#34;&gt;العربية&lt;/a&gt; |
        &lt;a href=&#34;https://github.com/huggingface/transformers/blob/main/i18n/README_ur.md&#34;&gt;اردو&lt;/a&gt; |
    &lt;/p&gt;
&lt;/h4&gt;
&lt;h3 align=&#34;center&#34;&gt;
    &lt;p&gt;State-of-the-art pretrained models for inference and training&lt;/p&gt;
&lt;/h3&gt;
&lt;h3 align=&#34;center&#34;&gt;
    &lt;img src=&#34;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_as_a_model_definition.png&#34;/&gt;
&lt;/h3&gt;
&lt;p&gt;Transformers acts as the model-definition framework for state-of-the-art machine learning models in text, computer
vision, audio, video, and multimodal models, for both inference and training.&lt;/p&gt;
&lt;p&gt;It centralizes the model definition so that this definition is agreed upon across the ecosystem. &lt;code&gt;transformers&lt;/code&gt; is the
pivot across frameworks: if a model definition is supported, it will be compatible with the majority of training
frameworks (Axolotl, Unsloth, DeepSpeed, FSDP, PyTorch-Lightning, &amp;hellip;), inference engines (vLLM, SGLang, TGI, &amp;hellip;),
and adjacent modeling libraries (llama.cpp, mlx, &amp;hellip;) which leverage the model definition from &lt;code&gt;transformers&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;We pledge to help support new state-of-the-art models and democratize their usage by having their model definition be
simple, customizable, and efficient.&lt;/p&gt;
&lt;p&gt;There are over 1M Transformers &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?library=transformers&amp;amp;sort=trending&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;model checkpoints&lt;/a&gt; on the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.com/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face Hub&lt;/a&gt; you can use.&lt;/p&gt;
&lt;p&gt;Explore the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.com/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hub&lt;/a&gt; today to find a model and use Transformers to help you get started right away.&lt;/p&gt;
&lt;h2 id=&#34;installation&#34;&gt;Installation
&lt;/h2&gt;&lt;p&gt;Transformers works with Python 3.9+, &lt;a class=&#34;link&#34; href=&#34;https://pytorch.org/get-started/locally/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PyTorch&lt;/a&gt; 2.1+, &lt;a class=&#34;link&#34; href=&#34;https://www.tensorflow.org/install/pip&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TensorFlow&lt;/a&gt; 2.6+, and &lt;a class=&#34;link&#34; href=&#34;https://flax.readthedocs.io/en/latest/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Flax&lt;/a&gt; 0.4.1+.&lt;/p&gt;
&lt;p&gt;Create and activate a virtual environment with &lt;a class=&#34;link&#34; href=&#34;https://docs.python.org/3/library/venv.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;venv&lt;/a&gt; or &lt;a class=&#34;link&#34; href=&#34;https://docs.astral.sh/uv/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;uv&lt;/a&gt;, a fast Rust-based Python package and project manager.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# venv&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;python&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;m&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;venv&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;my&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;env&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;source&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;my&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;env&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;bin&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;activate&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# uv&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;uv&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;venv&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;my&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;env&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;source&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;my&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;env&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;bin&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;activate&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Install Transformers in your virtual environment.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# pip&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pip&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;install&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;transformers[torch]&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# uv&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;uv&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pip&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;install&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;transformers[torch]&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the &lt;em&gt;latest&lt;/em&gt; version may not be stable. Feel free to open an &lt;a class=&#34;link&#34; href=&#34;https://github.com/huggingface/transformers/issues&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;issue&lt;/a&gt; if you encounter an error.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/huggingface/transformers.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; transformers
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# pip&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install .&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;torch&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# uv&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv pip install .&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;torch&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;quickstart&#34;&gt;Quickstart
&lt;/h2&gt;&lt;p&gt;Get started with Transformers right away with the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/docs/transformers/pipeline_tutorial&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Pipeline&lt;/a&gt; API. The &lt;code&gt;Pipeline&lt;/code&gt; is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output.&lt;/p&gt;
&lt;p&gt;Instantiate a pipeline and specify the model to use for text generation. The model is downloaded and cached so you can easily reuse it. Finally, pass some text to prompt the model.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;text-generation&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;Qwen/Qwen2.5-1.5B&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;the secret to baking a really good cake is &amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;generated_text&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;To chat with a model, the usage pattern is the same. The only difference is that you need to construct a chat history (the input to &lt;code&gt;Pipeline&lt;/code&gt;) between you and the system.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
You can also chat with a model directly from the command line.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;transformers chat Qwen/Qwen2.5-0.5B-Instruct
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/blockquote&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;system&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986.&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;user&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Hey, can you tell me any fun things to do in New York?&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;text-generation&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;meta-llama/Meta-Llama-3-8B-Instruct&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;device_map&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;auto&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;response&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;512&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;response&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;][&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;generated_text&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;][&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;][&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Expand the examples below to see how &lt;code&gt;Pipeline&lt;/code&gt; works for different modalities and tasks.&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;Automatic speech recognition&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;automatic-speech-recognition&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;openai/whisper-large-v3&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;text&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39; I have a dream that one day this nation will rise up and live out the true meaning of its creed.&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Image classification&lt;/summary&gt;
&lt;h3 align=&#34;center&#34;&gt;
    &lt;a&gt;&lt;img src=&#34;https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png&#34;&gt;&lt;/a&gt;
&lt;/h3&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;image-classification&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;facebook/dinov2-small-imagenet1k-1-layer&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;label&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;macaw&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;score&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;0.997848391532898&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;label&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;s1&#34;&gt;&amp;#39;score&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;0.0016551691805943847&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;label&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;lorikeet&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;score&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;0.00018523589824326336&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;label&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;African grey, African gray, Psittacus erithacus&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;s1&#34;&gt;&amp;#39;score&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;7.85409429227002e-05&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;label&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;quail&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;score&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;5.502637941390276e-05&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Visual question answering&lt;/summary&gt;
&lt;h3 align=&#34;center&#34;&gt;
    &lt;a&gt;&lt;img src=&#34;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg&#34;&gt;&lt;/a&gt;
&lt;/h3&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;visual-question-answering&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;Salesforce/blip-vqa-base&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;pipeline&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;What is in the image?&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;answer&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;statue of liberty&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h2 id=&#34;why-should-i-use-transformers&#34;&gt;Why should I use Transformers?
&lt;/h2&gt;&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;Easy-to-use state-of-the-art models:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;High performance on natural language understanding &amp;amp; generation, computer vision, audio, video, and multimodal tasks.&lt;/li&gt;
&lt;li&gt;Low barrier to entry for researchers, engineers, and developers.&lt;/li&gt;
&lt;li&gt;Few user-facing abstractions with just three classes to learn.&lt;/li&gt;
&lt;li&gt;A unified API for using all our pretrained models.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Lower compute costs, smaller carbon footprint:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Share trained models instead of training from scratch.&lt;/li&gt;
&lt;li&gt;Reduce compute time and production costs.&lt;/li&gt;
&lt;li&gt;Dozens of model architectures with 1M+ pretrained checkpoints across all modalities.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Choose the right framework for every part of a model&amp;rsquo;s lifetime:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Train state-of-the-art models in 3 lines of code (a minimal sketch follows this list).&lt;/li&gt;
&lt;li&gt;Move a single model between PyTorch/JAX/TF2.0 frameworks at will.&lt;/li&gt;
&lt;li&gt;Pick the right framework for training, evaluation, and production.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Easily customize a model or an example to your needs:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;We provide examples for each architecture to reproduce the results published by its original authors.&lt;/li&gt;
&lt;li&gt;Model internals are exposed as consistently as possible.&lt;/li&gt;
&lt;li&gt;Model files can be used independently of the library for quick experiments.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ol&gt;
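&lt;p&gt;As a minimal sketch of the &amp;lsquo;3 lines&amp;rsquo; claim above, here is a tiny fine-tuning run with the &lt;code&gt;Trainer&lt;/code&gt; API; the checkpoint, dataset slice, and sequence length are illustrative choices, not requirements of the library.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;from datasets import load_dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments

# Illustrative checkpoint and dataset; swap in your own.
tokenizer = AutoTokenizer.from_pretrained(&amp;#34;distilbert-base-uncased&amp;#34;)
model = AutoModelForSequenceClassification.from_pretrained(&amp;#34;distilbert-base-uncased&amp;#34;)
dataset = load_dataset(&amp;#34;imdb&amp;#34;, split=&amp;#34;train[:1%]&amp;#34;)
dataset = dataset.map(lambda x: tokenizer(x[&amp;#34;text&amp;#34;], truncation=True, padding=&amp;#34;max_length&amp;#34;, max_length=128), batched=True)

# The three lines: configure, construct, train.
args = TrainingArguments(output_dir=&amp;#34;out&amp;#34;)
trainer = Trainer(model=model, args=args, train_dataset=dataset)
trainer.train()
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;p&gt;&lt;code&gt;TrainingArguments&lt;/code&gt; carries the knobs (batch size, learning rate, epochs); the defaults are enough for a smoke test like this.&lt;/p&gt;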
&lt;a target=&#34;_blank&#34; href=&#34;https://huggingface.co/enterprise&#34;&gt;
    &lt;img alt=&#34;Hugging Face Enterprise Hub&#34; src=&#34;https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925&#34;&gt;
&lt;/a&gt;&lt;br&gt;
&lt;h2 id=&#34;why-shouldnt-i-use-transformers&#34;&gt;Why shouldn&amp;rsquo;t I use Transformers?
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;This library is not a modular toolbox of building blocks for neural nets. The code in the model files is deliberately not refactored with additional abstractions, so that researchers can quickly iterate on each model without digging through extra layers of abstractions and files.&lt;/li&gt;
&lt;li&gt;The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/docs/accelerate&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Accelerate&lt;/a&gt; (a minimal sketch follows this list).&lt;/li&gt;
&lt;li&gt;The &lt;a class=&#34;link&#34; href=&#34;https://github.com/huggingface/transformers/tree/main/examples&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;example scripts&lt;/a&gt; are only &lt;em&gt;examples&lt;/em&gt;. They may not necessarily work out-of-the-box on your specific use case and you&amp;rsquo;ll need to adapt the code for it to work.&lt;/li&gt;
&lt;/ul&gt;
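&lt;p&gt;To make the second point concrete, a generic loop with Accelerate looks roughly like the sketch below; the tiny model and random data are stand-ins so the example is self-contained.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-py&#34; data-lang=&#34;py&#34;&gt;import torch
from accelerate import Accelerator

# Toy stand-ins so the sketch runs end to end.
model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
dataset = torch.utils.data.TensorDataset(torch.randn(32, 4), torch.randn(32, 1))
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8)

accelerator = Accelerator()
# Accelerate handles device placement and distributed wrappers;
# the loop itself stays plain PyTorch.
model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)

loss_fn = torch.nn.MSELoss()
for inputs, targets in dataloader:
    optimizer.zero_grad()
    loss = loss_fn(model(inputs), targets)
    accelerator.backward(loss)  # replaces loss.backward()
    optimizer.step()
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;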
&lt;h2 id=&#34;100-projects-using-transformers&#34;&gt;100 projects using Transformers
&lt;/h2&gt;&lt;p&gt;Transformers is more than a toolkit for using pretrained models; it&amp;rsquo;s a community of projects built around it and the
Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone
else to build their dream projects.&lt;/p&gt;
&lt;p&gt;To celebrate Transformers reaching 100,000 stars, we wanted to put the spotlight on the
community with the &lt;a class=&#34;link&#34; href=&#34;./awesome-transformers.md&#34; &gt;awesome-transformers&lt;/a&gt; page, which lists 100
incredible projects built with Transformers.&lt;/p&gt;
&lt;p&gt;If you own or use a project that you believe should be part of the list, please open a PR to add it!&lt;/p&gt;
&lt;h2 id=&#34;example-models&#34;&gt;Example models
&lt;/h2&gt;&lt;p&gt;You can test most of our models directly on their &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hub model pages&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Expand each modality below to see a few example models for various use cases (a usage sketch follows the lists).&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;Audio&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Audio classification with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openai/whisper-large-v3-turbo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Whisper&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Automatic speech recognition with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/UsefulSensors/moonshine&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Moonshine&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Keyword spotting with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/superb/wav2vec2-base-superb-ks&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Wav2Vec2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Speech to speech generation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/kyutai/moshiko-pytorch-bf16&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Moshi&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Text to audio with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/facebook/musicgen-large&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MusicGen&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Text to speech with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/suno/bark&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bark&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Computer vision&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Automatic mask generation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/facebook/sam-vit-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SAM&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Depth estimation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/apple/DepthPro-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DepthPro&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Image classification with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/facebook/dinov2-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DINO v2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Keypoint detection with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/magic-leap-community/superpoint&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SuperPoint&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Keypoint matching with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/magic-leap-community/superglue_outdoor&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SuperGlue&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Object detection with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/PekingU/rtdetr_v2_r50vd&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RT-DETRv2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Pose Estimation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/usyd-community/vitpose-base-simple&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VitPose&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Universal segmentation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/shi-labs/oneformer_ade20k_swin_large&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OneFormer&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Video classification with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/MCG-NJU/videomae-large&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VideoMAE&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Multimodal&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Audio or text to text with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen2-Audio-7B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2-Audio&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Document question answering with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft/layoutlmv3-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LayoutLMv3&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Image or text to text with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen-VL&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Image captioning with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Salesforce/blip2-opt-2.7b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BLIP-2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;OCR-based document understanding with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GOT-OCR2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Table question answering with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google/tapas-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TAPAS&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Unified multimodal understanding and generation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/BAAI/Emu3-Gen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Emu3&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Vision to text with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llava-OneVision&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Visual question answering with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/llava-hf/llava-1.5-7b-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llava&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Visual referring expression segmentation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft/kosmos-2-patch14-224&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Kosmos-2&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;NLP&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Masked word completion with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/answerdotai/ModernBERT-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModernBERT&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Named entity recognition with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google/gemma-2-2b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Question answering with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai/Mixtral-8x7B-v0.1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mixtral&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Summarization with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/facebook/bart-large-cnn&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BART&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Translation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google-t5/t5-base&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;T5&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Text generation with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/meta-llama/Llama-3.2-1B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Text classification with &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen2.5-0.5B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
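&lt;p&gt;All of the checkpoints above are driven through the same high-level entry point. As a minimal sketch (the input text is just an example), here is the summarization checkpoint from the NLP list loaded via the &lt;code&gt;pipeline()&lt;/code&gt; API:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: drive a checkpoint listed above through the unified pipeline() API.
from transformers import pipeline

summarizer = pipeline('summarization', model='facebook/bart-large-cnn')
text = (
    'Transformers provides thousands of pretrained models to perform tasks on '
    'text, vision, and audio, all exposed through a single pipeline entry point.'
)
print(summarizer(text, max_length=30, min_length=10)[0]['summary_text'])
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;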
&lt;h2 id=&#34;citation&#34;&gt;Citation
&lt;/h2&gt;&lt;p&gt;We now have a &lt;a class=&#34;link&#34; href=&#34;https://www.aclweb.org/anthology/2020.emnlp-demos.6/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;paper&lt;/a&gt; you can cite for the 🤗 Transformers library:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bibtex&#34; data-lang=&#34;bibtex&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@inproceedings&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;wolf-etal-2020-transformers&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;Transformers: State-of-the-Art Natural Language Processing&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;booktitle&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;month&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nv&#34;&gt;oct&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;2020&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;address&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;Online&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;publisher&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;Association for Computational Linguistics&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;url&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;https://www.aclweb.org/anthology/2020.emnlp-demos.6&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;pages&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;38--45&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        <item>
        <title>kotaemon</title>
        <link>https://producthunt.programnotes.cn/en/p/kotaemon/</link>
        <pubDate>Wed, 10 Sep 2025 15:28:12 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/kotaemon/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1676309973406-1153db3131ee?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTc0ODkyMzB8&amp;ixlib=rb-4.1.0" alt="Featured image of post kotaemon" /&gt;&lt;h1 id=&#34;cinnamonkotaemon&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Cinnamon/kotaemon&lt;/a&gt;
&lt;/h1&gt;&lt;div align=&#34;center&#34;&gt;
&lt;h1 id=&#34;kotaemon&#34;&gt;kotaemon
&lt;/h1&gt;&lt;p&gt;A clean &amp;amp; customizable open-source RAG UI for chatting with your documents. Built with both end users and
developers in mind.&lt;/p&gt;
&lt;p&gt;&lt;img src=&#34;https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview-graph.png&#34; loading=&#34;lazy&#34; alt=&#34;Preview&#34;&gt;&lt;/p&gt;
&lt;p&gt;&lt;a href=&#34;https://trendshift.io/repositories/11607&#34; target=&#34;_blank&#34;&gt;&lt;img src=&#34;https://trendshift.io/api/badge/repositories/11607&#34; alt=&#34;Cinnamon%2Fkotaemon | Trendshift&#34; style=&#34;width: 250px; height: 55px;&#34; width=&#34;250&#34; height=&#34;55&#34;/&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/cin-model/kotaemon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Live Demo #1&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/cin-model/kotaemon-demo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Live Demo #2&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;https://cinnamon.github.io/kotaemon/online_install/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Online Install&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/drive/1eTfieec_UOowNizTJA1NjawBJH9y_1nn&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Colab Notebook (Local RAG)&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://cinnamon.github.io/kotaemon/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;User Guide&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;https://cinnamon.github.io/kotaemon/development/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Developer Guide&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/issues&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Feedback&lt;/a&gt; |
&lt;a class=&#34;link&#34; href=&#34;mailto:kotaemon.support@cinnamon.is&#34; &gt;Contact&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.python.org/downloads/release/python-31013/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/python-3.10&amp;#43;-blue.svg&#34; loading=&#34;lazy&#34; alt=&#34;Python 3.10&amp;#43;&#34;&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/psf/black&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/code%20style-black-000000.svg&#34; loading=&#34;lazy&#34; alt=&#34;Code style: black&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon&#34; target=&#34;_blank&#34;&gt;
&lt;img src=&#34;https://img.shields.io/badge/docker_pull-kotaemon:latest-brightgreen&#34; alt=&#34;docker pull ghcr.io/cinnamon/kotaemon:latest&#34;&gt;&lt;/a&gt;
&lt;img src=&#34;https://img.shields.io/github/downloads/Cinnamon/kotaemon/total.svg?label=downloads&amp;amp;color=blue&#34; loading=&#34;lazy&#34; alt=&#34;download&#34;&gt;
&lt;a href=&#39;https://huggingface.co/spaces/cin-model/kotaemon-demo&#39;&gt;&lt;img src=&#39;https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue&#39;&gt;&lt;/a&gt;
&lt;a href=&#34;https://hellogithub.com/en/repository/d3141471a0244d5798bc654982b263eb&#34; target=&#34;_blank&#34;&gt;&lt;img src=&#34;https://abroad.hellogithub.com/v1/widgets/recommend.svg?rid=d3141471a0244d5798bc654982b263eb&amp;claim_uid=RLiD9UZ1rEHNaMf&amp;theme=small&#34; alt=&#34;Featured｜HelloGitHub&#34; /&gt;&lt;/a&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;!-- start-intro --&gt;
&lt;h2 id=&#34;introduction&#34;&gt;Introduction
&lt;/h2&gt;&lt;p&gt;This project serves as a functional RAG UI for both end users who want to do QA on their
documents and developers who want to build their own RAG pipeline.
&lt;br&gt;&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-yml&#34; data-lang=&#34;yml&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;+----------------------------------------------------------------------------+&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;| End users&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;Those who use apps built with `kotaemon`.                       |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;| (You use an app like the one in the demo above)                            |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     +----------------------------------------------------------------+     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;|     | Developers&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;Those who built with `kotaemon`.                   |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     | (You have `import kotaemon` somewhere in your project)         |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     |     +----------------------------------------------------+     |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;|     |     | Contributors&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;Those who make `kotaemon` better.    |     |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     |     | (You make PR to this repo)                         |     |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     |     +----------------------------------------------------+     |     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;|     +----------------------------------------------------------------+     |&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;l&#34;&gt;+----------------------------------------------------------------------------+&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;for-end-users&#34;&gt;For end users
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Clean &amp;amp; Minimalistic UI&lt;/strong&gt;: A user-friendly interface for RAG-based QA.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Support for Various LLMs&lt;/strong&gt;: Compatible with LLM API providers (OpenAI, AzureOpenAI, Cohere, etc.) and local LLMs (via &lt;code&gt;ollama&lt;/code&gt; and &lt;code&gt;llama-cpp-python&lt;/code&gt;).&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Easy Installation&lt;/strong&gt;: Simple scripts to get you started quickly.&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;for-developers&#34;&gt;For developers
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Framework for RAG Pipelines&lt;/strong&gt;: Tools to build your own RAG-based document QA pipeline.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Customizable UI&lt;/strong&gt;: See your RAG pipeline in action with the provided UI, built with &lt;a href=&#39;https://github.com/gradio-app/gradio&#39;&gt;Gradio &lt;img src=&#39;https://img.shields.io/github/stars/gradio-app/gradio&#39;&gt;&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Gradio Theme&lt;/strong&gt;: If you use Gradio for development, check out our theme here: &lt;a class=&#34;link&#34; href=&#34;https://github.com/lone17/kotaemon-gradio-theme&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;kotaemon-gradio-theme&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;key-features&#34;&gt;Key Features
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Host your own document QA (RAG) web-UI&lt;/strong&gt;: Support multi-user login, organize your files in private/public collections, collaborate and share your favorite chat with others.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Organize your LLM &amp;amp; Embedding models&lt;/strong&gt;: Support both local LLMs &amp;amp; popular API providers (OpenAI, Azure, Ollama, Groq).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Hybrid RAG pipeline&lt;/strong&gt;: Sane default RAG pipeline with a hybrid (full-text &amp;amp; vector) retriever and re-ranking to ensure the best retrieval quality.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Multi-modal QA support&lt;/strong&gt;: Perform question answering over multiple documents, with support for figures and tables. Support multi-modal document parsing (selectable options in the UI).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Advanced citations with document preview&lt;/strong&gt;: By default the system provides detailed citations to ensure the correctness of LLM answers. View your citations (incl. relevance score) directly in the &lt;em&gt;in-browser PDF viewer&lt;/em&gt; with highlights. A warning is shown when the retrieval pipeline returns low-relevance articles.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Support complex reasoning methods&lt;/strong&gt;: Use question decomposition to answer your complex/multi-hop questions. Support agent-based reasoning with &lt;code&gt;ReAct&lt;/code&gt;, &lt;code&gt;ReWOO&lt;/code&gt;, and other agents.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Configurable settings UI&lt;/strong&gt;: You can adjust the most important aspects of the retrieval &amp;amp; generation process in the UI (incl. prompts).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Extensible&lt;/strong&gt;: Because it is built on Gradio, you are free to customize or add any UI elements you like. We also aim to support multiple strategies for document indexing &amp;amp; retrieval; the &lt;code&gt;GraphRAG&lt;/code&gt; indexing pipeline is provided as an example.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;img src=&#34;https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/preview.png&#34; loading=&#34;lazy&#34; alt=&#34;Preview&#34;&gt;&lt;/p&gt;
&lt;h2 id=&#34;installation&#34;&gt;Installation
&lt;/h2&gt;&lt;blockquote&gt;
&lt;p&gt;If you are not a developer and just want to use the app, please check out our easy-to-follow &lt;a class=&#34;link&#34; href=&#34;https://cinnamon.github.io/kotaemon/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;User Guide&lt;/a&gt;. Download the &lt;code&gt;.zip&lt;/code&gt; file from the &lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/releases/latest&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;latest release&lt;/a&gt; to get all the newest features and bug fixes.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h3 id=&#34;system-requirements&#34;&gt;System requirements
&lt;/h3&gt;&lt;ol&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.python.org/downloads/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Python&lt;/a&gt; &amp;gt;= 3.10&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.docker.com/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docker&lt;/a&gt;: optional, if you &lt;a class=&#34;link&#34; href=&#34;#with-docker-recommended&#34; &gt;install with Docker&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.unstructured.io/open-source/installation/full-installation#full-installation&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Unstructured&lt;/a&gt; if you want to process files other than &lt;code&gt;.pdf&lt;/code&gt;, &lt;code&gt;.html&lt;/code&gt;, &lt;code&gt;.mhtml&lt;/code&gt;, and &lt;code&gt;.xlsx&lt;/code&gt; documents. Installation steps differ depending on your operating system. Please visit the link and follow the specific instructions provided there.&lt;/li&gt;
&lt;/ol&gt;
&lt;h3 id=&#34;with-docker-recommended&#34;&gt;With Docker (recommended)
&lt;/h3&gt;&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;We provide both &lt;code&gt;lite&lt;/code&gt; &amp;amp; &lt;code&gt;full&lt;/code&gt; versions of the Docker image. The &lt;code&gt;full&lt;/code&gt; version additionally installs the extra &lt;code&gt;unstructured&lt;/code&gt; packages, which add support for more file types (&lt;code&gt;.doc&lt;/code&gt;, &lt;code&gt;.docx&lt;/code&gt;, &amp;hellip;) at the cost of a larger image size. The &lt;code&gt;lite&lt;/code&gt; image should work well for most users.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;To use the &lt;code&gt;full&lt;/code&gt; version:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-e &lt;span class=&#34;nv&#34;&gt;GRADIO_SERVER_NAME&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;0.0.0.0 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-e &lt;span class=&#34;nv&#34;&gt;GRADIO_SERVER_PORT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;7860&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-v ./ktem_app_data:/app/ktem_app_data &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-p 7860:7860 -it --rm &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;ghcr.io/cinnamon/kotaemon:main-full
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;To use the &lt;code&gt;full&lt;/code&gt; version with bundled &lt;strong&gt;Ollama&lt;/strong&gt; for &lt;em&gt;local / private RAG&lt;/em&gt;:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# change image name to&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run &amp;lt;...&amp;gt; ghcr.io/cinnamon/kotaemon:main-ollama
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;To use the &lt;code&gt;lite&lt;/code&gt; version:&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; &lt;span class=&#34;c1&#34;&gt;# change image name to&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt; docker run &amp;lt;...&amp;gt; ghcr.io/cinnamon/kotaemon:main-lite
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;We currently support and test two platforms: &lt;code&gt;linux/amd64&lt;/code&gt; and &lt;code&gt;linux/arm64&lt;/code&gt; (for newer Macs). You can specify the platform by passing &lt;code&gt;--platform&lt;/code&gt; in the &lt;code&gt;docker run&lt;/code&gt; command. For example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# To run docker with platform linux/arm64&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-e &lt;span class=&#34;nv&#34;&gt;GRADIO_SERVER_NAME&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;0.0.0.0 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-e &lt;span class=&#34;nv&#34;&gt;GRADIO_SERVER_PORT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;7860&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-v ./ktem_app_data:/app/ktem_app_data &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;-p 7860:7860 -it --rm &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;--platform linux/arm64 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;ghcr.io/cinnamon/kotaemon:main-lite
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Once everything is set up correctly, you can go to &lt;code&gt;http://localhost:7860/&lt;/code&gt; to access the WebUI (see the reachability sketch after this list).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;We use &lt;a class=&#34;link&#34; href=&#34;https://docs.github.com/en/packages/working-with-a-github-packages-registry/working-with-the-container-registry&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GHCR&lt;/a&gt; to store docker images, all images can be found &lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/pkgs/container/kotaemon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here.&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;/ol&gt;
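&lt;p&gt;Once the container is running, you can sanity-check that the WebUI is reachable before opening a browser. A minimal sketch using only the Python standard library (it assumes the &lt;code&gt;-p 7860:7860&lt;/code&gt; mapping from the commands above):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: poll the Gradio port published by the container (-p 7860:7860).
import urllib.request

try:
    with urllib.request.urlopen('http://localhost:7860/', timeout=5) as resp:
        print('WebUI is up, HTTP status', resp.status)
except OSError as exc:
    print('WebUI not reachable yet:', exc)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;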
&lt;h3 id=&#34;without-docker&#34;&gt;Without Docker
&lt;/h3&gt;&lt;ol&gt;
&lt;li&gt;
&lt;p&gt;Clone the repo and install the required packages in a fresh Python environment.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# optional (setup env)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;conda create -n kotaemon &lt;span class=&#34;nv&#34;&gt;python&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;3.10
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;conda activate kotaemon
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# clone this repo&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/Cinnamon/kotaemon
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; kotaemon
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -e &lt;span class=&#34;s2&#34;&gt;&amp;#34;libs/kotaemon[all]&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -e &lt;span class=&#34;s2&#34;&gt;&amp;#34;libs/ktem&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Create a &lt;code&gt;.env&lt;/code&gt; file in the root of this project. Use &lt;code&gt;.env.example&lt;/code&gt; as a template.&lt;/p&gt;
&lt;p&gt;The &lt;code&gt;.env&lt;/code&gt; file serves use cases where users want to pre-configure the models before starting up the app (e.g. deploying the app on HF Hub). It is only used to populate the database once, on the first run; it is not read again on subsequent runs (see the sanity-check sketch after these steps).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;(Optional) To enable the in-browser &lt;code&gt;PDF_JS&lt;/code&gt; viewer, download &lt;a class=&#34;link&#34; href=&#34;https://github.com/mozilla/pdf.js/releases/download/v4.0.379/pdfjs-4.0.379-dist.zip&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PDF_JS_DIST&lt;/a&gt;, then extract it to &lt;code&gt;libs/ktem/ktem/assets/prebuilt&lt;/code&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;/ol&gt;
&lt;img src=&#34;https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/pdf-viewer-setup.png&#34; alt=&#34;pdf-setup&#34; width=&#34;300&#34;&gt;
&lt;ol start=&#34;4&#34;&gt;
&lt;li&gt;
&lt;p&gt;Start the web server:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;python app.py
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;The app will be automatically launched in your browser.&lt;/li&gt;
&lt;li&gt;Default username and password are both &lt;code&gt;admin&lt;/code&gt;. You can set up additional users directly through the UI.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;img src=&#34;https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/chat-tab.png&#34; loading=&#34;lazy&#34; alt=&#34;Chat tab&#34;&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Check the &lt;code&gt;LLMs and Embeddings&lt;/code&gt; section of the &lt;code&gt;Resources&lt;/code&gt; tab and ensure that your &lt;code&gt;api_key&lt;/code&gt; value has been picked up correctly from your &lt;code&gt;.env&lt;/code&gt; file. If it is not set, you can set it there.&lt;/p&gt;
&lt;/li&gt;
&lt;/ol&gt;
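&lt;p&gt;Before the very first run, it can be useful to verify that your &lt;code&gt;.env&lt;/code&gt; values are actually visible to the process. A minimal sanity-check sketch, assuming the &lt;code&gt;python-dotenv&lt;/code&gt; package (&lt;code&gt;pip install python-dotenv&lt;/code&gt;) and the OpenAI variable names shown later in this guide:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: confirm .env values are readable before the first startup.
# Assumes python-dotenv; the variable names follow the OpenAI example below.
import os

from dotenv import load_dotenv

load_dotenv('.env')  # the app itself only consumes these values on the first run
for key in ('OPENAI_API_KEY', 'OPENAI_CHAT_MODEL', 'OPENAI_EMBEDDINGS_MODEL'):
    print(key, 'set' if os.getenv(key) else 'MISSING')
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;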
&lt;h3 id=&#34;setup-graphrag&#34;&gt;Setup GraphRAG
&lt;/h3&gt;&lt;blockquote&gt;
&lt;p&gt;[!NOTE]
Official MS GraphRAG indexing only works with OpenAI or Ollama API.
We recommend that most users use the NanoGraphRAG implementation for straightforward integration with Kotaemon.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;details&gt;
&lt;summary&gt;Setup Nano GRAPHRAG&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Install nano-GraphRAG: &lt;code&gt;pip install nano-graphrag&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;&lt;code&gt;nano-graphrag&lt;/code&gt; install might introduce version conflicts, see &lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/issues/440&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this issue&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;To quickly fix: &lt;code&gt;pip uninstall hnswlib chroma-hnswlib &amp;amp;&amp;amp; pip install chroma-hnswlib&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;Launch Kotaemon with the &lt;code&gt;USE_NANO_GRAPHRAG=true&lt;/code&gt; environment variable.&lt;/li&gt;
&lt;li&gt;Set your default LLM &amp;amp; embedding models in the Resources settings; they will be recognized automatically by NanoGraphRAG.&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Setup LIGHTRAG&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Install LightRAG: &lt;code&gt;pip install git+https://github.com/HKUDS/LightRAG.git&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;&lt;code&gt;LightRAG&lt;/code&gt; install might introduce version conflicts, see &lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/issues/440&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this issue&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;To quickly fix: &lt;code&gt;pip uninstall hnswlib chroma-hnswlib &amp;amp;&amp;amp; pip install chroma-hnswlib&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;Launch Kotaemon with the &lt;code&gt;USE_LIGHTRAG=true&lt;/code&gt; environment variable.&lt;/li&gt;
&lt;li&gt;Set your default LLM &amp;amp; embedding models in the Resources settings; they will be recognized automatically by LightRAG.&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Setup MS GRAPHRAG&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Non-Docker Installation&lt;/strong&gt;: If you are not using Docker, install GraphRAG with the following command:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install &lt;span class=&#34;s2&#34;&gt;&amp;#34;graphrag&amp;lt;=0.3.6&amp;#34;&lt;/span&gt; future
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Setting Up API KEY&lt;/strong&gt;: To use the GraphRAG retriever feature, ensure you set the &lt;code&gt;GRAPHRAG_API_KEY&lt;/code&gt; environment variable. You can do this directly in your environment or by adding it to a &lt;code&gt;.env&lt;/code&gt; file.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Using Local Models and Custom Settings&lt;/strong&gt;: If you want to use GraphRAG with local models (like &lt;code&gt;Ollama&lt;/code&gt;) or customize the default LLM and other configurations, set the &lt;code&gt;USE_CUSTOMIZED_GRAPHRAG_SETTING&lt;/code&gt; environment variable to true. Then, adjust your settings in the &lt;code&gt;settings.yaml.example&lt;/code&gt; file.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;h3 id=&#34;setup-local-models-for-localprivate-rag&#34;&gt;Setup Local Models (for local/private RAG)
&lt;/h3&gt;&lt;p&gt;See &lt;a class=&#34;link&#34; href=&#34;docs/local_model.md&#34; &gt;Local model setup&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;setup-multimodal-document-parsing-ocr-table-parsing-figure-extraction&#34;&gt;Setup multimodal document parsing (OCR, table parsing, figure extraction)
&lt;/h3&gt;&lt;p&gt;These options are available:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://azure.microsoft.com/en-us/products/ai-services/ai-document-intelligence&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Azure Document Intelligence (API)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://developer.adobe.com/document-services/docs/overview/pdf-extract-api/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Adobe PDF Extract (API)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/DS4SD/docling&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docling (local, open-source)&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;To use Docling, first install the required dependencies: &lt;code&gt;pip install docling&lt;/code&gt; (see the standalone sketch at the end of this section)&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Select the corresponding loaders in &lt;code&gt;Settings -&amp;gt; Retrieval Settings -&amp;gt; File loader&lt;/code&gt;.&lt;/p&gt;
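&lt;p&gt;For reference, Docling can also be exercised on its own, outside of Kotaemon&amp;rsquo;s loader wiring. A minimal sketch following the upstream quickstart (the input file name is hypothetical):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch of standalone Docling usage; 'my_report.pdf' is a hypothetical input.
from docling.document_converter import DocumentConverter

converter = DocumentConverter()
result = converter.convert('my_report.pdf')
print(result.document.export_to_markdown())
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;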
&lt;h3 id=&#34;customize-your-application&#34;&gt;Customize your application
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;By default, all application data is stored in the &lt;code&gt;./ktem_app_data&lt;/code&gt; folder. You can back up or copy this folder to transfer your installation to a new machine (see the backup sketch after this list).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;For advanced users or specific use cases, you can customize these files:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;flowsettings.py&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;&lt;code&gt;.env&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
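&lt;p&gt;Backing the folder up is an ordinary file copy. A minimal sketch using the Python standard library (the destination name is just an example):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: back up the app-data folder; the destination name is an example.
import shutil
from datetime import date

backup_dir = f'./ktem_app_data_backup_{date.today().isoformat()}'
shutil.copytree('./ktem_app_data', backup_dir)
print('Backed up to', backup_dir)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;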
&lt;h4 id=&#34;flowsettingspy&#34;&gt;&lt;code&gt;flowsettings.py&lt;/code&gt;
&lt;/h4&gt;&lt;p&gt;This file contains the configuration of your application. You can use the example
&lt;a class=&#34;link&#34; href=&#34;flowsettings.py&#34; &gt;here&lt;/a&gt; as a starting point.&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;Notable settings&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# setup your preferred document store (with full-text search capabilities)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;KH_DOCSTORE&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;Elasticsearch&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;LanceDB&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;SimpleFileDocumentStore&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# setup your preferred vectorstore (for vector-based search)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;KH_VECTORSTORE&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ChromaDB&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;LanceDB&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;InMemory&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Milvus&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;|&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Qdrant&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Enable / disable multimodal QA&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;KH_REASONINGS_USE_MULTIMODAL&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Setup your new reasoning pipeline or modify existing one.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;KH_REASONINGS&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;s2&#34;&gt;&amp;#34;ktem.reasoning.simple.FullQAPipeline&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;s2&#34;&gt;&amp;#34;ktem.reasoning.simple.FullDecomposeQAPipeline&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;s2&#34;&gt;&amp;#34;ktem.reasoning.react.ReactAgentPipeline&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;s2&#34;&gt;&amp;#34;ktem.reasoning.rewoo.RewooAgentPipeline&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h4 id=&#34;env&#34;&gt;&lt;code&gt;.env&lt;/code&gt;
&lt;/h4&gt;&lt;p&gt;This file provides another way to configure your models and credentials.&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;Configure model via the .env file&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Alternatively, you can configure the models via the &lt;code&gt;.env&lt;/code&gt; file with the information needed to connect to the LLMs. This file is located in the application folder. If you don&amp;rsquo;t see it, you can create one.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Currently, the following providers are supported:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;OpenAI&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;In the &lt;code&gt;.env&lt;/code&gt; file, set the &lt;code&gt;OPENAI_API_KEY&lt;/code&gt; variable to your OpenAI API key in order to enable access to OpenAI&amp;rsquo;s models. Other variables can also be modified; feel free to edit them to fit your case. Otherwise, the default parameters should work for most people.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;OPENAI_API_BASE&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;https://api.openai.com/v1
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;OPENAI_API_KEY&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&amp;lt;your OpenAI API key here&amp;gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;OPENAI_CHAT_MODEL&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;gpt-3.5-turbo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;OPENAI_EMBEDDINGS_MODEL&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;text-embedding-ada-002
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Azure OpenAI&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;For OpenAI models via the Azure platform, you need to provide your Azure endpoint and API key. You might also need to provide the deployment names for the chat model and the embedding model, depending on how you set up your Azure deployment.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AZURE_OPENAI_ENDPOINT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AZURE_OPENAI_API_KEY&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;OPENAI_API_VERSION&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;2024-02-15-preview
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AZURE_OPENAI_CHAT_DEPLOYMENT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;gpt-35-turbo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;text-embedding-ada-002
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Local Models&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Using the &lt;code&gt;ollama&lt;/code&gt; OpenAI-compatible server:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;Install &lt;a class=&#34;link&#34; href=&#34;https://github.com/ollama/ollama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ollama&lt;/a&gt; and start the application.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Pull your model, for example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;ollama pull llama3.1:8b
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;ollama pull nomic-embed-text
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Set the model names in the web UI and make them the default:&lt;/p&gt;
&lt;p&gt;&lt;img src=&#34;https://raw.githubusercontent.com/Cinnamon/kotaemon/main/docs/images/models.png&#34; loading=&#34;lazy&#34; alt=&#34;Models&#34;&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;Using &lt;code&gt;GGUF&lt;/code&gt; with &lt;code&gt;llama-cpp-python&lt;/code&gt;&lt;/p&gt;
&lt;p&gt;You can search for and download an LLM to run locally from the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face Hub&lt;/a&gt;. Currently, these model formats are supported:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;GGUF&lt;/p&gt;
&lt;p&gt;You should choose a model whose size is smaller than your device&amp;rsquo;s available memory, leaving about 2 GB of headroom. For example, if you have 16 GB of RAM in total, of which 12 GB is available, then you should choose a model that takes up at most 10 GB of RAM (see the sketch at the end of this section). Bigger models tend to give better generations but also take more processing time.&lt;/p&gt;
&lt;p&gt;Here are some recommendations and their size in memory:&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen1.5-1.8B-Chat-GGUF/resolve/main/qwen1_5-1_8b-chat-q8_0.gguf?download=true&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen1.5-1.8B-Chat-GGUF&lt;/a&gt;: around 2 GB&lt;/p&gt;
&lt;p&gt;Add a new LlamaCpp model with the provided model name in the web UI.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
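&lt;p&gt;As a back-of-the-envelope check, the GGUF sizing rule above reduces to a single subtraction. A minimal sketch using the example figures from that note:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: largest GGUF size that still leaves ~2 GB of headroom.
available_ram_gb = 12   # e.g. 12 GB free out of 16 GB total, as in the example
headroom_gb = 2         # keep roughly 2 GB for the OS and the app itself
max_model_gb = available_ram_gb - headroom_gb
print(f'Choose a GGUF file of at most {max_model_gb} GB')  # 10 GB here
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;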
&lt;h3 id=&#34;adding-your-own-rag-pipeline&#34;&gt;Adding your own RAG pipeline
&lt;/h3&gt;&lt;h4 id=&#34;custom-reasoning-pipeline&#34;&gt;Custom Reasoning Pipeline
&lt;/h4&gt;&lt;ol&gt;
&lt;li&gt;Check the default pipeline implementation &lt;a class=&#34;link&#34; href=&#34;libs/ktem/ktem/reasoning/simple.py&#34; &gt;here&lt;/a&gt;. You can make quick adjustments to how the default QA pipeline works.&lt;/li&gt;
&lt;li&gt;Add a new &lt;code&gt;.py&lt;/code&gt; implementation in &lt;code&gt;libs/ktem/ktem/reasoning/&lt;/code&gt; and later include it in &lt;code&gt;flowsettings&lt;/code&gt; to enable it in the UI (see the registration sketch below).&lt;/li&gt;
&lt;/ol&gt;
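&lt;p&gt;Registration then amounts to appending your module path to the &lt;code&gt;KH_REASONINGS&lt;/code&gt; list shown in the &lt;code&gt;flowsettings.py&lt;/code&gt; notes above. A minimal sketch (&lt;code&gt;my_pipeline&lt;/code&gt; and &lt;code&gt;MyQAPipeline&lt;/code&gt; are hypothetical names):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# flowsettings.py -- minimal sketch; 'my_pipeline' / 'MyQAPipeline' are hypothetical.
KH_REASONINGS = [
    'ktem.reasoning.simple.FullQAPipeline',     # keep the default pipeline
    'ktem.reasoning.my_pipeline.MyQAPipeline',  # your new .py under libs/ktem/ktem/reasoning/
]
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;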
&lt;h4 id=&#34;custom-indexing-pipeline&#34;&gt;Custom Indexing Pipeline
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;Check sample implementation in &lt;code&gt;libs/ktem/ktem/index/file/graph&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;(More instructions WIP.)&lt;/p&gt;
&lt;/blockquote&gt;
&lt;!-- end-intro --&gt;
&lt;h2 id=&#34;citation&#34;&gt;Citation
&lt;/h2&gt;&lt;p&gt;Please cite this project as:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-BibTeX&#34; data-lang=&#34;BibTeX&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@misc&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;kotaemon2024&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;{Kotaemon - An open-source RAG-based tool for chatting with any content.}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;{The Kotaemon Team}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;{2024}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;na&#34;&gt;howpublished&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;{\url{https://github.com/Cinnamon/kotaemon}}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;star-history&#34;&gt;Star History
&lt;/h2&gt;&lt;a href=&#34;https://star-history.com/#Cinnamon/kotaemon&amp;Date&#34;&gt;
 &lt;picture&gt;
   &lt;source media=&#34;(prefers-color-scheme: dark)&#34; srcset=&#34;https://api.star-history.com/svg?repos=Cinnamon/kotaemon&amp;type=Date&amp;theme=dark&#34; /&gt;
   &lt;source media=&#34;(prefers-color-scheme: light)&#34; srcset=&#34;https://api.star-history.com/svg?repos=Cinnamon/kotaemon&amp;type=Date&#34; /&gt;
   &lt;img alt=&#34;Star History Chart&#34; src=&#34;https://api.star-history.com/svg?repos=Cinnamon/kotaemon&amp;type=Date&#34; /&gt;
 &lt;/picture&gt;
&lt;/a&gt;
&lt;h2 id=&#34;contribution&#34;&gt;Contribution
&lt;/h2&gt;&lt;p&gt;Since our project is actively being developed, we greatly value your feedback and contributions. Please see our &lt;a class=&#34;link&#34; href=&#34;https://github.com/Cinnamon/kotaemon/blob/main/CONTRIBUTING.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Contributing Guide&lt;/a&gt; to get started. Thank you to all our contributors!&lt;/p&gt;
&lt;a href=&#34;https://github.com/Cinnamon/kotaemon/graphs/contributors&#34;&gt;
  &lt;img src=&#34;https://contrib.rocks/image?repo=Cinnamon/kotaemon&#34; /&gt;
&lt;/a&gt;
</description>
        </item>
        <item>
        <title>MiniCPM-V</title>
        <link>https://producthunt.programnotes.cn/en/p/minicpm-v/</link>
        <pubDate>Tue, 02 Sep 2025 15:29:41 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/minicpm-v/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1638382620941-f5c0628d21bd?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTY3OTgwOTd8&amp;ixlib=rb-4.1.0" alt="Featured image of post MiniCPM-V" /&gt;&lt;h1 id=&#34;openbmbminicpm-v&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenBMB/MiniCPM-V&lt;/a&gt;
&lt;/h1&gt;&lt;div align=&#34;center&#34;&gt;
&lt;p&gt;&lt;img src=&#34;./assets/minicpm_v_and_minicpm_o_title.png&#34; width=&#34;500em&#34; &gt;&lt;/img&gt;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;A GPT-4o Level MLLM for Single Image, Multi Image and High-FPS Video Understanding on Your Phone&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;./README_zh.md&#34; &gt;中文&lt;/a&gt; |
English&lt;/strong&gt;&lt;/p&gt;
&lt;span style=&#34;display: inline-flex; align-items: center; margin-right: 2px;&#34;&gt;
  &lt;img src=&#34;./assets/wechat.png&#34; alt=&#34;WeChat&#34; style=&#34;margin-right: 4px;&#34;&gt;
  &lt;a href=&#34;docs/wechat.md&#34; target=&#34;_blank&#34;&gt; WeChat&lt;/a&gt; &amp;nbsp;|
&lt;/span&gt;
&amp;nbsp;
&lt;span style=&#34;display: inline-flex; align-items: center; margin-left: -8px;&#34;&gt;
&lt;img src=&#34;./assets/discord.png&#34; alt=&#34;Discord&#34; style=&#34;margin-right: 4px;&#34;&gt;
  &lt;a href=&#34;https://discord.gg/rftuRMbqzf&#34; target=&#34;_blank&#34;&gt; Discord&lt;/a&gt; &amp;nbsp;
&lt;/span&gt;
&lt;p align=&#34;center&#34;&gt;
   MiniCPM-V 4.5 &lt;a href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5&#34;&gt;🤗&lt;/a&gt; &lt;a href=&#34;http://101.126.42.235:30910/&#34;&gt;🤖&lt;/a&gt; | MiniCPM-o 2.6 &lt;a href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6&#34;&gt;🤗&lt;/a&gt;  &lt;a href=&#34;https://minicpm-omni-webdemo-us.modelbest.cn/&#34;&gt; 🤖&lt;/a&gt; | &lt;a href=&#34;https://github.com/OpenSQZ/MiniCPM-V-Cookbook&#34;&gt;🍳 Cookbook&lt;/a&gt; | 
  📄 Technical Report (Coming Soon)
&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;&lt;strong&gt;MiniCPM-V&lt;/strong&gt; is a series of efficient end-side multimodal LLMs (MLLMs), which accept images, videos and text as inputs and deliver high-quality text outputs. &lt;strong&gt;MiniCPM-o&lt;/strong&gt; additionally takes audio as inputs and provides high-quality speech outputs in an end-to-end fashion. Since February 2024, we have released 7 versions of the model, aiming to achieve &lt;strong&gt;strong performance and efficient deployment&lt;/strong&gt;. The most notable models in the series currently include:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;MiniCPM-V 4.5&lt;/strong&gt;: 🔥🔥🔥 The latest and most capable model in the MiniCPM-V series. With a total of 8B parameters, this model &lt;strong&gt;outperforms GPT-4o-latest, Gemini-2.0 Pro, and Qwen2.5-VL 72B&lt;/strong&gt; in vision-language capabilities, making it the most performant on-device multimodal model in the open-source community. This version brings &lt;strong&gt;new features including efficient high-FPS and long video understanding (up to 96x compression rate for video tokens), controllable hybrid fast/deep thinking, strong handwritten OCR and complex table/document parsing&lt;/strong&gt;. It also advances MiniCPM-V&amp;rsquo;s popular features such as trustworthy behavior, multilingual support and end-side deployability.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;MiniCPM-o 2.6&lt;/strong&gt;: ⭐️⭐️⭐️ The most capable model in the MiniCPM-o series. With a total of 8B parameters, this end-to-end model &lt;strong&gt;achieves comparable performance to GPT-4o-202405 in vision, speech, and multimodal live streaming&lt;/strong&gt;, making it one of the most versatile and performant models in the open-source community. For the new voice mode, MiniCPM-o 2.6 &lt;strong&gt;supports bilingual real-time speech conversation with configurable voices&lt;/strong&gt;, and also allows for fun capabilities such as emotion/speed/style control, end-to-end voice cloning, role play, etc. Due to its superior token density, MiniCPM-o 2.6 can for the first time &lt;strong&gt;support multimodal live streaming on end-side devices&lt;/strong&gt; such as iPad.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;news&#34;&gt;News &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;h4 id=&#34;-pinned&#34;&gt;📌 Pinned
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;[2025.09.01] ⭐️⭐️⭐️ MiniCPM-V 4.5 has been officially supported by &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/15575&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.cpp&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/vllm-project/vllm/pull/23586&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;vLLM&lt;/a&gt;, and &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/pull/9022&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA-Factory&lt;/a&gt;. You are welcome to use it directly through these official channels! Support for additional frameworks such as &lt;a class=&#34;link&#34; href=&#34;https://github.com/ollama/ollama/pull/12078&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/sgl-project/sglang/pull/9610&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SGLang&lt;/a&gt; is actively in progress.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.08.26] 🔥🔥🔥 We open-source MiniCPM-V 4.5, which outperforms GPT-4o-latest, Gemini-2.0 Pro, and Qwen2.5-VL 72B. It advances popular capabilities of MiniCPM-V, and brings useful new features. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.08.01] ⭐️⭐️⭐️ We open-sourced the &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V &amp;amp; o Cookbook&lt;/a&gt;! It provides comprehensive guides for diverse user scenarios, paired with our new &lt;a class=&#34;link&#34; href=&#34;https://minicpm-o.readthedocs.io/en/latest/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docs Site&lt;/a&gt; for smoother onboarding.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.06.20] ⭐️⭐️⭐️ Our official &lt;a class=&#34;link&#34; href=&#34;https://ollama.com/openbmb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama repository&lt;/a&gt; is released. Try our latest models with &lt;a class=&#34;link&#34; href=&#34;https://ollama.com/openbmb/minicpm-o2.6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;one click&lt;/a&gt;!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.03.01] 🚀🚀🚀 RLAIF-V, the alignment technique of MiniCPM-o, has been accepted as a CVPR 2025 Highlight! The &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLAIF-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;code&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;dataset&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.17220&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;paper&lt;/a&gt; are open-sourced!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.24] 📢📢📢 MiniCPM-o 2.6 technical report is released! See &lt;a class=&#34;link&#34; href=&#34;https://openbmb.notion.site/MiniCPM-o-2-6-A-GPT-4o-Level-MLLM-for-Vision-Speech-and-Multimodal-Live-Streaming-on-Your-Phone-185ede1b7a558042b5d5e45e6b237da9&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.19] 📢 &lt;strong&gt;ATTENTION!&lt;/strong&gt; We are currently working on merging MiniCPM-o 2.6 into the official repositories of llama.cpp, Ollama, and vllm. Until the merge is complete, please USE OUR LOCAL FORKS of &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.cpp&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama&lt;/a&gt;, and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-o?tab=readme-ov-file#efficient-inference-with-llamacpp-ollama-vllm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;vllm&lt;/a&gt;. &lt;strong&gt;Using the official repositories before the merge may lead to unexpected issues&lt;/strong&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.19] ⭐️⭐️⭐️ MiniCPM-o tops GitHub Trending and reaches top-2 on Hugging Face Trending!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.17] We have updated the usage of the int4 quantized version of MiniCPM-o 2.6 and resolved the model initialization error. Click &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt; and try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.13] 🔥🔥🔥 We open-source MiniCPM-o 2.6, which matches GPT-4o-202405 on vision, speech and multimodal live streaming. It advances popular capabilities of MiniCPM-V 2.6, and supports various new fun features. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.17] 🚀🚀🚀 MiniCPM-V 2.6 is now fully supported by &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggerganov/llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;official&lt;/a&gt; llama.cpp! GGUF models of various sizes are available &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.06] 🔥🔥🔥 We open-source MiniCPM-V 2.6, which outperforms GPT-4V on single image, multi-image and video understanding. It advances popular features of MiniCPM-Llama3-V 2.5, and can support real-time video understanding on iPad. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.03] MiniCPM-Llama3-V 2.5 technical report is released! See &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2408.01800&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.23] 🔥🔥🔥 MiniCPM-V tops GitHub Trending and Hugging Face Trending! Our demo, recommended by Hugging Face Gradio’s official account, is available &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;. Come and try it out!&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;br&gt;
&lt;details&gt; 
&lt;summary&gt;Click to view more news.&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;[2025.08.02] 🚀🚀🚀 We open-source MiniCPM-V 4.0, which outperforms GPT-4.1-mini-20250414 in image understanding. It advances popular features of MiniCPM-V 2.6, and largely improves the efficiency. We also open-source the iOS App on iPhone and iPad. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2025.01.23] 💡💡💡 MiniCPM-o 2.6 is now supported by &lt;a class=&#34;link&#34; href=&#34;https://github.com/PKU-Alignment/align-anything&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Align-Anything&lt;/a&gt;, a framework by PKU-Alignment Team for aligning any-to-any modality large models with human intentions. It supports DPO and SFT fine-tuning on both vision and audio. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.15] We now also support multi-image SFT. For more details, please refer to the &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;document&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.14] MiniCPM-V 2.6 now also supports &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/ms-swift/issues/1613&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fine-tuning&lt;/a&gt; with the SWIFT framework!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.08.10] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 is now fully supported by &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggerganov/llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;official&lt;/a&gt; llama.cpp! GGUF models of various sizes are available &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.07.19] MiniCPM-Llama3-V 2.5 supports vLLM now! See &lt;a class=&#34;link&#34; href=&#34;#inference-with-vllm&#34; &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.06.03] Now, you can run MiniCPM-Llama3-V 2.5 on multiple low-VRAM GPUs (12 GB or 16 GB) by distributing the model&amp;rsquo;s layers across them. For more details, check this &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;link&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.28] 🚀🚀🚀 MiniCPM-Llama3-V 2.5 now fully supports its feature in llama.cpp and Ollama! Please pull the latest code &lt;strong&gt;of our provided forks&lt;/strong&gt; (&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/llama.cpp/blob/minicpm-v2.5/examples/minicpmv/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.cpp&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/ollama/tree/minicpm-v2.5/examples/minicpm-v2.5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama&lt;/a&gt;). GGUF models in various sizes are available &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf/tree/main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;. MiniCPM-Llama3-V 2.5 series is &lt;strong&gt;not supported by the official repositories yet&lt;/strong&gt;, and we are working hard to merge PRs. Please stay tuned!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.28] 💫 We now support LoRA fine-tuning for MiniCPM-Llama3-V 2.5, using only 2 V100 GPUs! See more statistics &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-V/tree/main/finetune#model-fine-tuning-memory-usage-statistics&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.25] MiniCPM-Llama3-V 2.5 now supports streaming outputs and customized system prompts. Try it &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5#usage&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.24] We release the MiniCPM-Llama3-V 2.5 &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;gguf&lt;/a&gt;, which supports &lt;a class=&#34;link&#34; href=&#34;#inference-with-llamacpp&#34; &gt;llama.cpp&lt;/a&gt; inference and provides smooth decoding at 6&amp;ndash;8 tokens/s on mobile phones. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.23] 🔍 We&amp;rsquo;ve released a comprehensive comparison between Phi-3-vision-128k-instruct and MiniCPM-Llama3-V 2.5, including benchmark evaluations, multilingual capabilities, and inference efficiency 🌟📊🌍🚀. Click &lt;a class=&#34;link&#34; href=&#34;./docs/compare_with_phi-3_vision.md&#34; &gt;here&lt;/a&gt; to view more details.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.05.20] We open-source MiniCPM-Llama3-V 2.5, which has improved OCR capability and supports 30+ languages, representing the first end-side MLLM to achieve GPT-4V-level performance! We provide &lt;a class=&#34;link&#34; href=&#34;#deployment-on-mobile-phone&#34; &gt;efficient inference&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;./finetune/readme.md&#34; &gt;simple fine-tuning&lt;/a&gt;. Try it now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.04.23] MiniCPM-V-2.0 supports vLLM now! Click &lt;a class=&#34;link&#34; href=&#34;#inference-with-vllm&#34; &gt;here&lt;/a&gt; to view more details.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.04.18] We create a HuggingFace Space to host the demo of MiniCPM-V 2.0 at &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/openbmb/MiniCPM-V-2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.04.17] MiniCPM-V-2.0 supports deploying &lt;a class=&#34;link&#34; href=&#34;#webui-demo&#34; &gt;WebUI Demo&lt;/a&gt; now!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.04.15] MiniCPM-V-2.0 now also supports &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2%e6%9c%80%e4%bd%b3%e5%ae%9e%e8%b7%b5.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fine-tuning&lt;/a&gt; with the SWIFT framework!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.04.12] We open-source MiniCPM-V 2.0, which achieves comparable performance with Gemini Pro in understanding scene text and outperforms strong Qwen-VL-Chat 9.6B and Yi-VL 34B on &lt;a href=&#34;https://rank.opencompass.org.cn/leaderboard-multimodal&#34;&gt;OpenCompass&lt;/a&gt;, a comprehensive evaluation over 11 popular benchmarks. Click &lt;a href=&#34;https://openbmb.vercel.app/minicpm-v-2&#34;&gt;here&lt;/a&gt; to view the MiniCPM-V 2.0 technical blog.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.03.14] MiniCPM-V now supports &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v%e6%9c%80%e4%bd%b3%e5%ae%9e%e8%b7%b5.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fine-tuning&lt;/a&gt; with the SWIFT framework. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/Jintao-Huang&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Jintao&lt;/a&gt; for the contribution!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.03.01] MiniCPM-V can now be deployed on Mac!&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;[2024.02.01] We open-source MiniCPM-V and OmniLMM-12B, which support efficient end-side deployment and powerful multimodal capabilities, respectively.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt; 
&lt;h2 id=&#34;contents&#34;&gt;Contents &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#minicpm-v-45&#34; &gt;MiniCPM-V 4.5&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#inference-efficiency&#34; &gt;Inference Efficiency&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#minicpm-o-26&#34; &gt;MiniCPM-o 2.6&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#minicpm-v--o-cookbook&#34; &gt;MiniCPM-V &amp;amp; o Cookbook&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#chat-with-our-demo-on-gradio-&#34; &gt;Chat with Our Demo on Gradio 🤗&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#inference&#34; &gt;Inference&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#model-zoo&#34; &gt;Model Zoo&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#multi-turn-conversation&#34; &gt;Multi-turn Conversation&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#chat-with-multiple-images&#34; &gt;Chat with Multiple Images&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#in-context-few-shot-learning&#34; &gt;In-context Few-shot Learning&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#chat-with-video&#34; &gt;Chat with Video&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#speech-and-audio-mode&#34; &gt;Speech and Audio Mode&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#multimodal-live-streaming&#34; &gt;Multimodal Live Streaming&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#inference-on-multiple-gpus&#34; &gt;Inference on Multiple GPUs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#inference-on-mac&#34; &gt;Inference on Mac&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#efficient-inference-with-llamacpp-ollama-vllm&#34; &gt;Efficient Inference with llama.cpp, Ollama, vLLM&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#fine-tuning&#34; &gt;Fine-tuning&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#awesome-work-using-minicpm-v--minicpm-o&#34; &gt;Awesome work using MiniCPM-V &amp;amp; MiniCPM-o&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#faqs&#34; &gt;FAQs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#limitations&#34; &gt;Limitations&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;minicpm-v-45&#34;&gt;MiniCPM-V 4.5
&lt;/h2&gt;&lt;p&gt;&lt;strong&gt;MiniCPM-V 4.5&lt;/strong&gt; is the latest and most capable model in the MiniCPM-V series. The model is built on Qwen3-8B and SigLIP2-400M with a total of 8B parameters. It exhibits a significant performance improvement over previous MiniCPM-V and MiniCPM-o models, and introduces new useful features. Notable features of MiniCPM-V 4.5 include:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;🔥 &lt;strong&gt;State-of-the-art Vision-Language Capability.&lt;/strong&gt;
MiniCPM-V 4.5 achieves an average score of 77.0 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. &lt;strong&gt;With only 8B parameters, it surpasses widely used proprietary models like GPT-4o-latest, Gemini-2.0 Pro, and strong open-source models like Qwen2.5-VL 72B&lt;/strong&gt; for vision-language capabilities, making it the most performant MLLM under 30B parameters.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;🎬 &lt;strong&gt;Efficient High-FPS and Long Video Understanding.&lt;/strong&gt; Powered by a new unified 3D-Resampler over images and videos, MiniCPM-V 4.5 achieves a 96x compression rate for video tokens: six 448x448 video frames are jointly compressed into 64 video tokens (normally 1,536 tokens for most MLLMs). This means the model can perceive significantly more video frames without increasing LLM inference cost, bringing efficient, state-of-the-art high-FPS (up to 10 FPS) and long video understanding on Video-MME, LVBench, MLVU, MotionBench, FavorBench, etc.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;⚙️ &lt;strong&gt;Controllable Hybrid Fast/Deep Thinking.&lt;/strong&gt; MiniCPM-V 4.5 supports both fast thinking, for efficient everyday use with competitive performance, and deep thinking, for more complex problem solving. To cover the efficiency/performance trade-offs of different user scenarios, the fast/deep thinking mode can be switched in a highly controllable fashion (a usage sketch follows this list).&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;💪 &lt;strong&gt;Strong OCR, Document Parsing and Others.&lt;/strong&gt;
Based on &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/pdf/2403.11703&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA-UHD&lt;/a&gt; architecture, MiniCPM-V 4.5 can process high-resolution images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344), using 4x fewer visual tokens than most MLLMs. The model achieves &lt;strong&gt;leading performance on OCRBench, surpassing proprietary models such as GPT-4o-latest and Gemini 2.5&lt;/strong&gt;. It also achieves state-of-the-art performance for PDF document parsing capability on OmniDocBench among general MLLMs. Based on the latest &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLAIF-V/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLAIF-V&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/VisCPM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VisCPM&lt;/a&gt; techniques, it features &lt;strong&gt;trustworthy behaviors&lt;/strong&gt;, outperforming GPT-4o-latest on MMHal-Bench, and supports &lt;strong&gt;multilingual capabilities&lt;/strong&gt; in more than 30 languages.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;💫  &lt;strong&gt;Easy Usage.&lt;/strong&gt;
MiniCPM-V 4.5 can be easily used in various ways: (1) &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/llama.cpp/blob/Support-MiniCPM-V-4.5/docs/multimodal/minicpmv4.5.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.cpp&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/ollama/tree/MIniCPM-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ollama&lt;/a&gt; support for efficient CPU inference on local devices, (2) &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;int4&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/AutoAWQ&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AWQ&lt;/a&gt; format quantized models in 16 sizes, (3) &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/sglang/tree/main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SGLang&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;#efficient-inference-with-llamacpp-ollama-vllm&#34; &gt;vLLM&lt;/a&gt; support for high-throughput and memory-efficient inference, (4) fine-tuning on new domains and tasks with &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/transformers/tree/main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Transformers&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;./docs/llamafactory_train_and_infer.md&#34; &gt;LLaMA-Factory&lt;/a&gt;, (5) quick &lt;a class=&#34;link&#34; href=&#34;#chat-with-our-demo-on-gradio&#34; &gt;local WebUI demo&lt;/a&gt;, (6) optimized &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/MiniCPM-o-demo-iOS&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;local iOS app&lt;/a&gt; on iPhone and iPad, and (7) online web demo on &lt;a class=&#34;link&#34; href=&#34;http://101.126.42.235:30910/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;server&lt;/a&gt;. See our &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Cookbook&lt;/a&gt; for full usage!&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
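&lt;p&gt;For path (4), inference boils down to a single &lt;code&gt;trust_remote_code&lt;/code&gt; chat call. Below is a minimal sketch following the pattern of the Hugging Face model card; the &lt;code&gt;enable_thinking&lt;/code&gt; flag for the fast/deep switch is taken from that pattern and may change between releases, so treat the exact signature as an assumption and check the card:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

model_id = 'openbmb/MiniCPM-V-4_5'
model = AutoModel.from_pretrained(
    model_id, trust_remote_code=True, torch_dtype=torch.bfloat16
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

image = Image.open('example.jpg').convert('RGB')
msgs = [{'role': 'user', 'content': [image, 'Describe this image.']}]

# enable_thinking toggles the hybrid fast/deep thinking mode described above.
answer = model.chat(msgs=msgs, tokenizer=tokenizer, enable_thinking=False)
print(answer)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;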
&lt;h3 id=&#34;key-techniques&#34;&gt;Key Techniques &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;div align=&#34;center&#34;&gt;
&lt;img src=&#34;./assets/minicpm-v-4dot5-framework.png&#34; width=&#34;100%&#34;&gt;
&lt;/div&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Architecture: Unified 3D-Resampler for High-density Video Compression.&lt;/strong&gt; MiniCPM-V 4.5 introduces a 3D-Resampler that overcomes the performance-efficiency trade-off in video understanding. By grouping and jointly compressing up to 6 consecutive video frames into just 64 tokens (the same token count used for a single image in the MiniCPM-V series), MiniCPM-V 4.5 achieves a 96× compression rate for video tokens (the arithmetic is sketched after this list). This allows the model to process more video frames without additional LLM computational cost, enabling high-FPS video and long video understanding. The architecture supports unified encoding for images, multi-image inputs, and videos, ensuring seamless capability and knowledge transfer.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Pre-training: Unified Learning for OCR and Knowledge from Documents.&lt;/strong&gt; Existing MLLMs learn OCR capability and knowledge from documents in isolated training approaches. We observe that the essential difference between these two training approaches is the visibility of the text in images. By dynamically corrupting text regions in documents with varying noise levels and asking the model to reconstruct the text, the model learns to adaptively and properly switch between accurate text recognition (when text is visible) and multimodal context-based knowledge reasoning (when text is heavily obscured). This eliminates reliance on error-prone document parsers in knowledge learning from documents, and prevents hallucinations from over-augmented OCR data, resulting in top-tier OCR and multimodal knowledge performance with minimal engineering overhead.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Post-training: Hybrid Fast/Deep Thinking with Multimodal RL.&lt;/strong&gt; MiniCPM-V 4.5 offers a balanced reasoning experience through two switchable modes: fast thinking for efficient daily use and deep thinking for complex tasks. Using a new hybrid reinforcement learning method, the model jointly optimizes both modes, significantly enhancing fast-mode performance without compromising deep-mode capability. Incorporated with &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/RLPR&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLPR&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLAIF-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLAIF-V&lt;/a&gt;, it generalizes robust reasoning skills from broad multimodal data while effectively reducing hallucinations.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
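&lt;p&gt;The quoted 96x ratio can be reproduced with simple patch arithmetic. In the sketch below, the 6-frames-to-64-tokens grouping comes from the description above, while the per-frame baseline of 1,024 raw patch tokens (a 14x14-patch ViT on a 448x448 frame) is our assumption for illustration:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Illustrative arithmetic for the 3D-Resampler's video-token compression.
frame_side = 448
patch_side = 14                                      # assumed ViT patch size
tokens_per_frame = (frame_side // patch_side) ** 2   # 32 * 32 = 1024

frames_per_group = 6      # consecutive frames, jointly compressed
compressed_tokens = 64    # same token budget as a single image

uncompressed = frames_per_group * tokens_per_frame   # 6144 tokens
print(uncompressed / compressed_tokens)              # 96.0, i.e. the 96x rate
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;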
&lt;h3 id=&#34;evaluation&#34;&gt;Evaluation  &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;div align=&#34;center&#34;&gt;
  &lt;img src=&#34;./assets/radar_minicpm_v45.png&#34; width=&#34;60%&#34;&gt;
&lt;/div&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;img src=&#34;./assets/minicpmv_4_5_evaluation_result.png&#34; width=&#34;80%&#34;&gt;
&lt;/div&gt;
&lt;h3 id=&#34;inference-efficiency&#34;&gt;Inference Efficiency
&lt;/h3&gt;&lt;p&gt;&lt;strong&gt;OpenCompass&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;left&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
            &lt;tr&gt;
              &lt;th align=&#34;left&#34;&gt;Model&lt;/th&gt;
              &lt;th&gt;Size&lt;/th&gt;
              &lt;th&gt;Avg Score ↑&lt;/th&gt;
              &lt;th&gt;Total Inference Time ↓&lt;/th&gt;
            &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GLM-4.1V-9B-Thinking&lt;/td&gt;
            &lt;td&gt;10.3B&lt;/td&gt;
            &lt;td&gt;76.6&lt;/td&gt;
            &lt;td&gt;17.5h&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiMo-VL-7B-RL&lt;/td&gt;
            &lt;td&gt;8.3B&lt;/td&gt;
            &lt;td&gt;76.4&lt;/td&gt;
            &lt;td&gt;11h&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-V 4.5&lt;/td&gt;
            &lt;td&gt;8.7B&lt;/td&gt;
            &lt;td&gt;&lt;b&gt;77.0&lt;/b&gt;&lt;/td&gt;
            &lt;td&gt;&lt;b&gt;7.5h&lt;/b&gt;&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p&gt;&lt;strong&gt;Video-MME&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;left&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
          &lt;tr&gt;
              &lt;th align=&#34;left&#34;&gt;Model&lt;/th&gt;
              &lt;th&gt;Size&lt;/th&gt;
              &lt;th&gt;Avg Score ↑&lt;/th&gt;
              &lt;th&gt;Total Inference Time ↓&lt;/th&gt;
              &lt;th&gt;GPU Mem ↓&lt;/th&gt;
          &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
          &lt;tr&gt;
              &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2.5-VL-7B-Instruct&lt;/td&gt;
              &lt;td&gt;8.3B&lt;/td&gt;
              &lt;td&gt;71.6&lt;/td&gt;
              &lt;td&gt;3h&lt;/td&gt;
              &lt;td&gt;60G&lt;/td&gt;
          &lt;/tr&gt;
          &lt;tr&gt;
              &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GLM-4.1V-9B-Thinking&lt;/td&gt;
              &lt;td&gt;10.3B&lt;/td&gt;
              &lt;td&gt;&lt;b&gt;73.6&lt;/b&gt;&lt;/td&gt;
              &lt;td&gt;2.63h&lt;/td&gt;
              &lt;td&gt;32G&lt;/td&gt;
          &lt;/tr&gt;
          &lt;tr&gt;
              &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-V 4.5&lt;/td&gt;
              &lt;td&gt;8.7B&lt;/td&gt;
              &lt;td&gt;73.5&lt;/td&gt;
              &lt;td&gt;&lt;b&gt;0.26h&lt;/b&gt;&lt;/td&gt;
              &lt;td&gt;&lt;b&gt;28G&lt;/b&gt;&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;p&gt;Both Video-MME and OpenCompass were evaluated using 8×A100 GPUs for inference. The reported inference time of Video-MME includes full model-side computation, and excludes the external cost of video frame extraction (dependent on specific frame extraction tools) for fair comparison.&lt;/p&gt;
&lt;h3 id=&#34;examples&#34;&gt;Examples  &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;div align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://www.youtube.com/watch?v=Cn23FujYMMU&#34;&gt;&lt;img src=&#34;./assets/minicpmv4_5/MiniCPM-V 4.5-8.26_img.jpeg&#34; width=&#34;70%&#34;&gt;&lt;/a&gt;
&lt;/div&gt;
&lt;div style=&#34;display: flex; flex-direction: column; align-items: center;&#34;&gt;
  &lt;img src=&#34;assets/minicpmv4_5/en_case1.png&#34; alt=&#34;en_case1&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
  &lt;img src=&#34;assets/minicpmv4_5/en_case2.png&#34; alt=&#34;en_case2&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
  &lt;img src=&#34;assets/minicpmv4_5/en_case3.jpeg&#34; alt=&#34;en_case3&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
&lt;/div&gt;
&lt;details&gt;
&lt;summary&gt;Click to view more cases.&lt;/summary&gt;
&lt;div style=&#34;display: flex; flex-direction: column; align-items: center;&#34;&gt;
  &lt;img src=&#34;assets/minicpmv4_5/zh_extra.jpeg&#34; alt=&#34;zh_extra&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
&lt;/div&gt;
&lt;/details&gt;
&lt;p&gt;We deploy MiniCPM-V 4.5 on an iPad M4 with our &lt;a class=&#34;link&#34; href=&#34;https://github.com/tc-mb/MiniCPM-o-demo-iOS&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;iOS demo&lt;/a&gt;. The demo video is a raw screen recording without any editing.&lt;/p&gt;
&lt;table align=&#34;center&#34;&gt; 
    &lt;p align=&#34;center&#34;&gt;
      &lt;img src=&#34;assets/minicpmv4_5/v45_en_handwriting.gif&#34; width=45%/&gt;
      &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;
      &lt;img src=&#34;assets/minicpmv4_5/v45_en_cot.gif&#34; width=45%/&gt;
    &lt;/p&gt;
    &lt;p align=&#34;center&#34;&gt;
      &lt;img src=&#34;assets/minicpmv4_5/v45_cn_handwriting.gif&#34; width=45%/&gt;
      &amp;nbsp;&amp;nbsp;&amp;nbsp;&amp;nbsp;
      &lt;img src=&#34;assets/minicpmv4_5/v45_cn_travel.gif&#34; width=45%/&gt;
    &lt;/p&gt;
&lt;/table&gt;
&lt;h2 id=&#34;minicpm-o-26&#34;&gt;MiniCPM-o 2.6
&lt;/h2&gt;&lt;p&gt;&lt;strong&gt;MiniCPM-o 2.6&lt;/strong&gt; is the latest and most capable model in the MiniCPM-o series. The model is built in an end-to-end fashion based on SigLip-400M, Whisper-medium-300M, ChatTTS-200M, and Qwen2.5-7B with a total of 8B parameters. It exhibits a significant performance improvement over MiniCPM-V 2.6, and introduces new features for real-time speech conversation and multimodal live streaming. Notable features of MiniCPM-o 2.6 include:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;🔥 &lt;strong&gt;Leading Visual Capability.&lt;/strong&gt;
MiniCPM-o 2.6 achieves an average score of 70.2 on OpenCompass, a comprehensive evaluation of 8 popular benchmarks. &lt;strong&gt;With only 8B parameters, it surpasses widely used proprietary models like GPT-4o-202405, Gemini 1.5 Pro, and Claude 3.5 Sonnet&lt;/strong&gt; for single image understanding. It also &lt;strong&gt;outperforms GPT-4V and Claude 3.5 Sonnet&lt;/strong&gt; in multi-image and video understanding, and shows promising in-context learning capability.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;🎙 &lt;strong&gt;State-of-the-art Speech Capability.&lt;/strong&gt; MiniCPM-o 2.6 supports &lt;strong&gt;bilingual real-time speech conversation with configurable voices&lt;/strong&gt; in English and Chinese. It &lt;strong&gt;outperforms GPT-4o-realtime on audio understanding tasks&lt;/strong&gt; such as ASR and STT translation, and shows &lt;strong&gt;state-of-the-art performance on speech conversation in both semantic and acoustic evaluations in the open-source community&lt;/strong&gt;. It also allows for fun features such as emotion/speed/style control, end-to-end voice cloning, role play, etc.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;🎬 &lt;strong&gt;Strong Multimodal Live Streaming Capability.&lt;/strong&gt; As a new feature, MiniCPM-o 2.6 can &lt;strong&gt;accept continuous video and audio streams independent of user queries, and support real-time speech interaction&lt;/strong&gt;. It &lt;strong&gt;outperforms GPT-4o-202408 and Claude 3.5 Sonnet and shows state-of-the-art performance in the open-source community on StreamingBench&lt;/strong&gt;, a comprehensive benchmark for real-time video understanding, omni-source (video &amp;amp; audio) understanding, and multimodal contextual understanding.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;💪 &lt;strong&gt;Strong OCR Capability and Others.&lt;/strong&gt;
Advancing popular visual capabilities from MiniCPM-V series, MiniCPM-o 2.6 can process images with any aspect ratio and up to 1.8 million pixels (e.g., 1344x1344). It achieves &lt;strong&gt;state-of-the-art performance on OCRBench for models under 25B, surpassing proprietary models such as GPT-4o-202405&lt;/strong&gt;.
Based on the latest &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLAIF-V/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLAIF-V&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/VisCPM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VisCPM&lt;/a&gt; techniques, it features &lt;strong&gt;trustworthy behaviors&lt;/strong&gt;, outperforming GPT-4o and Claude 3.5 Sonnet on MMHal-Bench, and supports &lt;strong&gt;multilingual capabilities&lt;/strong&gt; on more than 30 languages.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;🚀 &lt;strong&gt;Superior Efficiency.&lt;/strong&gt;
In addition to its friendly size, MiniCPM-o 2.6 also shows &lt;strong&gt;state-of-the-art token density&lt;/strong&gt; (i.e., the number of pixels encoded into each visual token). &lt;strong&gt;It produces only 640 tokens when processing a 1.8M pixel image, which is 75% fewer than most models&lt;/strong&gt;. This directly improves the inference speed, first-token latency, memory usage, and power consumption. As a result, MiniCPM-o 2.6 can efficiently support &lt;strong&gt;multimodal live streaming&lt;/strong&gt; on end-side devices such as iPads.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;💫  &lt;strong&gt;Easy Usage.&lt;/strong&gt;
MiniCPM-o 2.6 can be easily used in various ways: (1) &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/llama.cpp/blob/minicpm-omni/examples/llava/README-minicpmo2.6.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.cpp&lt;/a&gt; support for efficient CPU inference on local devices, (2) &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;int4&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF&lt;/a&gt; format quantized models in 16 sizes, (3) &lt;a class=&#34;link&#34; href=&#34;#efficient-inference-with-llamacpp-ollama-vllm&#34; &gt;vLLM&lt;/a&gt; support for high-throughput and memory-efficient inference, (4) fine-tuning on new domains and tasks with &lt;a class=&#34;link&#34; href=&#34;./docs/llamafactory_train_and_infer.md&#34; &gt;LLaMA-Factory&lt;/a&gt;, (5) quick &lt;a class=&#34;link&#34; href=&#34;#chat-with-our-demo-on-gradio&#34; &gt;local WebUI demo&lt;/a&gt;, and (6) online web demo on &lt;a class=&#34;link&#34; href=&#34;https://minicpm-omni-webdemo-us.modelbest.cn/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;server&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Model Architecture.&lt;/strong&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;End-to-end Omni-modal Architecture.&lt;/strong&gt; Different modality encoders/decoders are connected and trained in an &lt;strong&gt;end-to-end&lt;/strong&gt; fashion to fully exploit rich multimodal knowledge. The model is trained in a fully end-to-end manner with only CE loss.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Omni-modal Live Streaming Mechanism.&lt;/strong&gt; (1) We change the offline modality encoders/decoders into online ones for &lt;strong&gt;streaming inputs/outputs.&lt;/strong&gt; (2) We devise a &lt;strong&gt;time-division multiplexing (TDM) mechanism&lt;/strong&gt; for omni-modality streaming processing in the LLM backbone. It divides the parallel omni-modality streams into a sequential stream within small periodic time slices (a toy sketch follows this list).&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Configurable Speech Modeling Design.&lt;/strong&gt; We devise a multimodal system prompt, including traditional text system prompt, and &lt;strong&gt;a new audio system prompt to determine the assistant voice&lt;/strong&gt;. This enables flexible voice configurations in inference time, and also facilitates end-to-end voice cloning and description-based voice creation.&lt;/li&gt;
&lt;/ul&gt;
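&lt;p&gt;The TDM mechanism can be pictured as cutting the parallel modality streams into short periodic windows and feeding the backbone one window at a time. The toy sketch below is our illustration of that scheduling idea, not the model&amp;rsquo;s actual implementation:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;from itertools import zip_longest

def tdm_interleave(video_chunks, audio_chunks):
    '''Toy time-division multiplexing: fold parallel per-slice chunks
    from each modality into one sequential stream for the backbone.'''
    slices = zip_longest(video_chunks, audio_chunks)
    for t, (video, audio) in enumerate(slices):
        if video is not None:
            yield (t, 'video', video)
        if audio is not None:
            yield (t, 'audio', audio)

# One entry per periodic time slice (e.g. one second of frames/audio).
video = ['v0', 'v1', 'v2']
audio = ['a0', 'a1', 'a2', 'a3']
for slice_id, modality, chunk in tdm_interleave(video, audio):
    print(slice_id, modality, chunk)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;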
&lt;div align=&#34;center&#34;&gt;
&lt;img src=&#34;./assets/minicpm-o-26-framework-v2.png&#34; width=&#34;80%&#34;&gt;
&lt;/div&gt;
&lt;h3 id=&#34;evaluation-1&#34;&gt;Evaluation  &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;div align=&#34;center&#34;&gt;
  &lt;img src=&#34;./assets/radar.jpg&#34; width=&#34;80%&#34;&gt;
&lt;/div&gt;
&lt;details&gt;
&lt;summary&gt;Click to view visual understanding results.&lt;/summary&gt;
&lt;p&gt;&lt;strong&gt;Image Understanding&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Model&lt;/th&gt;
            &lt;th&gt;Size&lt;/th&gt;
            &lt;th&gt;Token Density&lt;sup&gt;+&lt;/sup&gt;&lt;/th&gt;
            &lt;th&gt;OpenCompass&lt;/th&gt;
            &lt;th&gt;OCRBench&lt;/th&gt;
            &lt;th&gt;MathVista mini&lt;/th&gt;
            &lt;th&gt;ChartQA&lt;/th&gt;
            &lt;th&gt;MMVet&lt;/th&gt;
            &lt;th&gt;MMStar&lt;/th&gt;
            &lt;th&gt;MME&lt;/th&gt;
            &lt;th&gt;MMB1.1 test&lt;/th&gt;
            &lt;th&gt;AI2D&lt;/th&gt;
            &lt;th&gt;MMMU val&lt;/th&gt;
            &lt;th&gt;HallusionBench&lt;/th&gt;
            &lt;th&gt;TextVQA val&lt;/th&gt;
            &lt;th&gt;DocVQA test&lt;/th&gt;
            &lt;th&gt;MathVerse mini&lt;/th&gt;
            &lt;th&gt;MathVision&lt;/th&gt;
            &lt;th&gt;MMHal Score&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;19&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Proprietary&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-20240513&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;1088&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;69.9&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;736&lt;/td&gt;
            &lt;td&gt;61.3&lt;/td&gt;
            &lt;td&gt;85.7&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;69.1&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;63.9&lt;/td&gt;
            &lt;td&gt;2328.7&lt;/td&gt;
            &lt;td&gt;82.2&lt;/td&gt;
            &lt;td&gt;84.6&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;69.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;55.0&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;92.8&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;50.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;30.4&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;3.6&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Claude3.5-Sonnet&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;750&lt;/td&gt;
            &lt;td&gt;67.9&lt;/td&gt;
            &lt;td&gt;788&lt;/td&gt;
            &lt;td&gt;61.6&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;90.8&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;66.0&lt;/td&gt;
            &lt;td&gt;62.2&lt;/td&gt;
            &lt;td&gt;1920.0&lt;/td&gt;
            &lt;td&gt;78.5&lt;/td&gt;
            &lt;td&gt;80.2&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;65.9&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;49.9&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;95.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;3.4&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Gemini 1.5 Pro&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;64.4&lt;/td&gt;
            &lt;td&gt;754&lt;/td&gt;
            &lt;td&gt;57.7&lt;/td&gt;
            &lt;td&gt;81.3&lt;/td&gt;
            &lt;td&gt;64.0&lt;/td&gt;
            &lt;td&gt;59.1&lt;/td&gt;
            &lt;td&gt;2110.6&lt;/td&gt;
            &lt;td&gt;73.9&lt;/td&gt;
            &lt;td&gt;79.1&lt;/td&gt;
            &lt;td&gt;60.6&lt;/td&gt;
            &lt;td&gt;45.6&lt;/td&gt;
            &lt;td&gt;73.5&lt;/td&gt;
            &lt;td&gt;86.5&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;19.2&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-mini-20240718&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;1088&lt;/td&gt;
            &lt;td&gt;64.1&lt;/td&gt;
            &lt;td&gt;785&lt;/td&gt;
            &lt;td&gt;52.4&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;66.9&lt;/td&gt;
            &lt;td&gt;54.8&lt;/td&gt;
            &lt;td&gt;2003.4&lt;/td&gt;
            &lt;td&gt;76.0&lt;/td&gt;
            &lt;td&gt;77.8&lt;/td&gt;
            &lt;td&gt;60.0&lt;/td&gt;
            &lt;td&gt;46.1&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;3.3&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;19&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Open Source&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Cambrian-34B&lt;/td&gt;
            &lt;td&gt;34B&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;1820&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;58.3&lt;/td&gt;
            &lt;td&gt;591&lt;/td&gt;
            &lt;td&gt;50.3&lt;/td&gt;
            &lt;td&gt;75.6&lt;/td&gt;
            &lt;td&gt;53.2&lt;/td&gt;
            &lt;td&gt;54.2&lt;/td&gt;
            &lt;td&gt;2049.9&lt;/td&gt;
            &lt;td&gt;77.8&lt;/td&gt;
            &lt;td&gt;79.5&lt;/td&gt;
            &lt;td&gt;50.4&lt;/td&gt;
            &lt;td&gt;41.6&lt;/td&gt;
            &lt;td&gt;76.7&lt;/td&gt;
            &lt;td&gt;75.5&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GLM-4V-9B&lt;/td&gt;
            &lt;td&gt;13B&lt;/td&gt;
            &lt;td&gt;784&lt;/td&gt;
            &lt;td&gt;59.1&lt;/td&gt;
            &lt;td&gt;776&lt;/td&gt;
            &lt;td&gt;51.1&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;58.0&lt;/td&gt;
            &lt;td&gt;54.8&lt;/td&gt;
            &lt;td&gt;2018.8&lt;/td&gt;
            &lt;td&gt;67.9&lt;/td&gt;
            &lt;td&gt;71.2&lt;/td&gt;
            &lt;td&gt;46.9&lt;/td&gt;
            &lt;td&gt;45.0&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Pixtral-12B&lt;/td&gt;
            &lt;td&gt;12B&lt;/td&gt;
            &lt;td&gt;256&lt;/td&gt;
            &lt;td&gt;61.0&lt;/td&gt;
            &lt;td&gt;685&lt;/td&gt;
            &lt;td&gt;56.9&lt;/td&gt;
            &lt;td&gt;81.8&lt;/td&gt;
            &lt;td&gt;58.5&lt;/td&gt;
            &lt;td&gt;54.5&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;72.7&lt;/td&gt;
            &lt;td&gt;79.0&lt;/td&gt;
            &lt;td&gt;51.1&lt;/td&gt;
            &lt;td&gt;47.0&lt;/td&gt;
            &lt;td&gt;75.7&lt;/td&gt;
            &lt;td&gt;90.7&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VITA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;784&lt;/td&gt;
            &lt;td&gt;63.3&lt;/td&gt;
            &lt;td&gt;741&lt;/td&gt;
            &lt;td&gt;66.2&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;52.7&lt;/td&gt;
            &lt;td&gt;60.2&lt;/td&gt;
            &lt;td&gt;2328.1&lt;/td&gt;
            &lt;td&gt;76.8&lt;/td&gt;
            &lt;td&gt;79.2&lt;/td&gt;
            &lt;td&gt;52.6&lt;/td&gt;
            &lt;td&gt;44.6&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;DeepSeek-VL2-27B (4B)&lt;/td&gt;
            &lt;td&gt;27B&lt;/td&gt;
            &lt;td&gt;672&lt;/td&gt;
            &lt;td&gt;66.4&lt;/td&gt;
            &lt;td&gt;809&lt;/td&gt;
            &lt;td&gt;63.9&lt;/td&gt;
            &lt;td&gt;86.0&lt;/td&gt;
            &lt;td&gt;60.0&lt;/td&gt;
            &lt;td&gt;61.9&lt;/td&gt;
            &lt;td&gt;2253.0&lt;/td&gt;
            &lt;td&gt;81.2&lt;/td&gt;
            &lt;td&gt;83.8&lt;/td&gt;
            &lt;td&gt;54.0&lt;/td&gt;
            &lt;td&gt;45.3&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;84.2&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;93.3&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;3.0&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2-VL-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;784&lt;/td&gt;
            &lt;td&gt;67.1&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;866&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;58.2&lt;/td&gt;
            &lt;td&gt;83.0&lt;/td&gt;
            &lt;td&gt;62.0&lt;/td&gt;
            &lt;td&gt;60.7&lt;/td&gt;
            &lt;td&gt;2326.0&lt;/td&gt;
            &lt;td&gt;81.8&lt;/td&gt;
            &lt;td&gt;83.0&lt;/td&gt;
            &lt;td&gt;54.1&lt;/td&gt;
            &lt;td&gt;50.6&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;84.3&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;94.5&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;31.9&lt;/td&gt;
            &lt;td&gt;16.3&lt;/td&gt;
            &lt;td&gt;3.2&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LLaVA-OneVision-72B&lt;/td&gt;
            &lt;td&gt;72B&lt;/td&gt;
            &lt;td&gt;182&lt;/td&gt;
            &lt;td&gt;68.1&lt;/td&gt;
            &lt;td&gt;741&lt;/td&gt;
            &lt;td&gt;67.5&lt;/td&gt;
            &lt;td&gt;83.7&lt;/td&gt;
            &lt;td&gt;60.6&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;65.8&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;2261.0&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;85.0&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;85.6&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;56.8&lt;/td&gt;
            &lt;td&gt;49.0&lt;/td&gt;
            &lt;td&gt;80.5&lt;/td&gt;
            &lt;td&gt;91.3&lt;/td&gt;
            &lt;td&gt;39.1&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;3.5&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;InternVL2.5-8B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;706&lt;/td&gt;
            &lt;td&gt;68.3&lt;/td&gt;
            &lt;td&gt;822&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;64.4&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;84.8&lt;/td&gt;
            &lt;td&gt;62.8&lt;/td&gt;
            &lt;td&gt;62.8&lt;/td&gt;
            &lt;td&gt;2344.0&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;83.6&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;84.5&lt;/td&gt;
            &lt;td&gt;56.0&lt;/td&gt;
            &lt;td&gt;50.1&lt;/td&gt;
            &lt;td&gt;79.1&lt;/td&gt;
            &lt;td&gt;93.0&lt;/td&gt;
            &lt;td&gt;39.5&lt;/td&gt;
            &lt;td&gt;19.7&lt;/td&gt;
            &lt;td&gt;3.4&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-V 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;2822&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;65.2&lt;/td&gt;
            &lt;td&gt;852*&lt;/td&gt;
            &lt;td&gt;60.6&lt;/td&gt;
            &lt;td&gt;79.4&lt;/td&gt;
            &lt;td&gt;60.0&lt;/td&gt;
            &lt;td&gt;57.5&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;2348.4*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;78.0&lt;/td&gt;
            &lt;td&gt;82.1&lt;/td&gt;
            &lt;td&gt;49.8*&lt;/td&gt;
            &lt;td&gt;48.1*&lt;/td&gt;
            &lt;td&gt;80.1&lt;/td&gt;
            &lt;td&gt;90.8&lt;/td&gt;
            &lt;td&gt;25.7&lt;/td&gt;
            &lt;td&gt;18.3&lt;/td&gt;
            &lt;td&gt;3.6&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;2822&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;70.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;897*&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;71.9*&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;86.9*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;67.5&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;64.0&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;2372.0*&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;80.5&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;85.8&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;50.4*&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;51.9&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;82.0&lt;/td&gt;
            &lt;td&gt;93.5&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;41.4*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;23.1*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;3.8&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
* We evaluate this benchmark using chain-of-thought prompting; for MME, we apply it only to the Cognition set.
&lt;p&gt;&lt;sup&gt;+&lt;/sup&gt; Token Density: number of pixels encoded into each visual token at maximum resolution, i.e., # pixels at maximum resolution / # visual tokens.&lt;/p&gt;
&lt;p&gt;Note: For proprietary models, we calculate token density based on the image encoding charging strategy defined in the official API documentation, which provides an upper-bound estimation.&lt;/p&gt;
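&lt;p&gt;As a worked example of the formula above (a minimal sketch; the 1344×1344 maximum resolution and 640 visual tokens are illustrative assumptions used only to reproduce the 2822 figure in the table):&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Token density = (# pixels at maximum resolution) / (# visual tokens).
# Illustrative assumptions: a 1344x1344 maximum-resolution image encoded into 640 tokens.
max_width, max_height = 1344, 1344
num_visual_tokens = 640

token_density = (max_width * max_height) / num_visual_tokens
print(round(token_density))  # 2822, matching the token-density column above
&lt;/code&gt;&lt;/pre&gt;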
&lt;p&gt;&lt;strong&gt;Multi-image and Video Understanding&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Model&lt;/th&gt;
            &lt;th&gt;Size&lt;/th&gt;
            &lt;th&gt;BLINK val&lt;/th&gt;
            &lt;th&gt;Mantis Eval&lt;/th&gt;
            &lt;th&gt;MIRB&lt;/th&gt;
            &lt;th&gt;Video-MME (wo / w subs)&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;6&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Proprietary&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-20240513&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;68.0&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;71.9/77.2&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT4V&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;54.6&lt;/td&gt;
            &lt;td&gt;62.7&lt;/td&gt;
            &lt;td&gt;53.1&lt;/td&gt;
            &lt;td&gt;59.9/63.3&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;6&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Open-source&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VITA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;45.0&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;56.1/58.7&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LLaVA-NeXT-Interleave 14B&lt;/td&gt;
            &lt;td&gt;14B&lt;/td&gt;
            &lt;td&gt;52.6&lt;/td&gt;
            &lt;td&gt;66.4&lt;/td&gt;
            &lt;td&gt;30.2&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LLaVA-OneVision-72B&lt;/td&gt;
            &lt;td&gt;72B&lt;/td&gt;
            &lt;td&gt;55.4&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;77.6&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;66.2/69.5&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MANTIS 8B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;49.1&lt;/td&gt;
            &lt;td&gt;59.5&lt;/td&gt;
            &lt;td&gt;34.8&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2-VL-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;53.2&lt;/td&gt;
            &lt;td&gt;69.6*&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;67.6*&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;63.3/69.0&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;InternVL2.5-8B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;54.8&lt;/td&gt;
            &lt;td&gt;67.7&lt;/td&gt;
            &lt;td&gt;52.5&lt;/td&gt;
            &lt;td&gt;64.2/66.9&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-V 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;53.0&lt;/td&gt;
            &lt;td&gt;69.1&lt;/td&gt;
            &lt;td&gt;53.8&lt;/td&gt;
            &lt;td&gt;60.9/63.6&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;56.7&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;71.9&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;58.6&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;63.9/67.9&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
* We evaluated the officially released checkpoints ourselves.
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Click to view audio understanding and speech conversation results.&lt;/summary&gt;
&lt;p&gt;&lt;strong&gt;Audio Understanding&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Task&lt;/th&gt;
            &lt;th&gt;Size&lt;/th&gt;
            &lt;th colspan=&#34;3&#34;&gt;ASR (zh)&lt;/th&gt;
            &lt;th colspan=&#34;3&#34;&gt;ASR (en)&lt;/th&gt;
            &lt;th colspan=&#34;2&#34;&gt;AST&lt;/th&gt;
            &lt;th&gt;Emotion&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Metric&lt;/th&gt;
            &lt;td&gt;&lt;/td&gt;
            &lt;th colspan=&#34;3&#34;&gt;CER↓&lt;/th&gt;
            &lt;th colspan=&#34;3&#34;&gt;WER↓&lt;/th&gt;
            &lt;th colspan=&#34;2&#34;&gt;BLEU↑&lt;/th&gt;
            &lt;th&gt;ACC↑&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Dataset&lt;/th&gt;
            &lt;td&gt;&lt;/td&gt;
            &lt;th&gt;AISHELL-1&lt;/th&gt;
            &lt;th&gt;Fleurs zh&lt;/th&gt;
            &lt;th&gt;WenetSpeech test-net&lt;/th&gt;
            &lt;th&gt;LibriSpeech test-clean&lt;/th&gt;
            &lt;th&gt;GigaSpeech&lt;/th&gt;
            &lt;th&gt;TED-LIUM&lt;/th&gt;
            &lt;th&gt;CoVoST en2zh&lt;/th&gt;
            &lt;th&gt;CoVoST zh2en&lt;/th&gt;
            &lt;th&gt;MELD emotion&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;11&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Proprietary&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-Realtime&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;7.3*&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;5.4*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;28.9*&lt;/td&gt;
            &lt;td&gt;2.6*&lt;/td&gt;
            &lt;td&gt;12.9*&lt;/td&gt;
            &lt;td&gt;4.8*&lt;/td&gt;
            &lt;td&gt;37.1*&lt;/td&gt;
            &lt;td&gt;15.7*&lt;/td&gt;
            &lt;td&gt;33.2*&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Gemini 1.5 Pro&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;4.5*&lt;/td&gt;
            &lt;td&gt;5.9*&lt;/td&gt;
            &lt;td&gt;14.3*&lt;/td&gt;
            &lt;td&gt;2.9*&lt;/td&gt;
            &lt;td&gt;10.6*&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;3.0*&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;47.3*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;22.6*&lt;/td&gt;
            &lt;td&gt;48.4*&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;11&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Open-Source&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2-Audio-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;7.5&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;1.6&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;45.2&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;24.4&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;55.3&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2-Audio-7B-Instruct&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;2.6*&lt;/td&gt;
            &lt;td&gt;6.9*&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;10.3*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;3.1*&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;9.7*&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;5.9*&lt;/td&gt;
            &lt;td&gt;39.5*&lt;/td&gt;
            &lt;td&gt;22.9*&lt;/td&gt;
            &lt;td&gt;17.4*&lt;/td&gt;
        &lt;/tr&gt;
          &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VITA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;2.16&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;8.4&lt;/td&gt;
            &lt;td&gt;3.4&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GLM-4-Voice-Base&lt;/td&gt;
            &lt;td&gt;9B&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;2.5&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;2.8&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;1.6&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;4.4&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;6.9&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;1.7&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;8.7&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;3.0&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;48.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;27.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;52.4&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
* We evaluated the officially released checkpoints ourselves.&lt;br&gt;&lt;br&gt;
&lt;p&gt;&lt;strong&gt;Speech Generation&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Task&lt;/th&gt;
            &lt;th&gt;Size&lt;/th&gt;
            &lt;th colspan=&#34;9&#34;&gt;SpeechQA&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Metric&lt;/th&gt;
            &lt;th&gt;&lt;/th&gt;
            &lt;th colspan=&#34;3&#34;&gt;ACC↑&lt;/th&gt;
            &lt;th&gt;G-Eval (10 point)↑&lt;/th&gt;
            &lt;th&gt;Semantic ELO score↑&lt;/th&gt;
            &lt;th&gt;Acoustic ELO score↑&lt;/th&gt;
            &lt;th&gt;Overall ELO score↑&lt;/th&gt;
            &lt;th&gt;UTMOS↑&lt;/th&gt;
            &lt;th&gt;ASR-WER↓&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Dataset&lt;/th&gt;
            &lt;th&gt;&lt;/th&gt;
            &lt;th&gt;Speech Llama Q.&lt;/th&gt;
            &lt;th&gt;Speech Web Q.&lt;/th&gt;
            &lt;th&gt;Speech Trivia QA&lt;/th&gt;
            &lt;th&gt;Speech AlpacaEval&lt;/th&gt;
            &lt;th colspan=&#34;5&#34;&gt;AudioArena&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;11&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Proprietary&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-Realtime&lt;/td&gt;
            &lt;td&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;71.7&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;51.6&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;69.7&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;7.4&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;1157&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;1203&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;1200&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;4.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;2.3&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;11&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Open-Source&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GLM-4-Voice&lt;/td&gt;
            &lt;td&gt;9B&lt;/td&gt;
            &lt;td&gt;50.0&lt;/td&gt;
            &lt;td&gt;32.0&lt;/td&gt;
            &lt;td&gt;36.4&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;5.1&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;999&lt;/td&gt;
            &lt;td&gt;1147&lt;/td&gt;
            &lt;td&gt;1035&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;4.1&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;11.7&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Llama-Omni&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;45.3&lt;/td&gt;
            &lt;td&gt;22.9&lt;/td&gt;
            &lt;td&gt;10.7&lt;/td&gt;
            &lt;td&gt;3.9&lt;/td&gt;
            &lt;td&gt;960&lt;/td&gt;
            &lt;td&gt;878&lt;/td&gt;
            &lt;td&gt;897&lt;/td&gt;
            &lt;td&gt;3.2&lt;/td&gt;
            &lt;td&gt;24.3&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VITA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;46.7&lt;/td&gt;
            &lt;td&gt;28.1&lt;/td&gt;
            &lt;td&gt;23.3&lt;/td&gt;
            &lt;td&gt;2.0&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Moshi&lt;/td&gt;
            &lt;td&gt;7B&lt;/td&gt;
            &lt;td&gt;43.7&lt;/td&gt;
            &lt;td&gt;23.8&lt;/td&gt;
            &lt;td&gt;16.7&lt;/td&gt;
            &lt;td&gt;2.4&lt;/td&gt;
            &lt;td&gt;871&lt;/td&gt;
            &lt;td&gt;808&lt;/td&gt;
            &lt;td&gt;875&lt;/td&gt;
            &lt;td&gt;2.8&lt;/td&gt;
            &lt;td&gt;8.2&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Mini-Omni&lt;/td&gt;
            &lt;td&gt;1B&lt;/td&gt;
            &lt;td&gt;22.0&lt;/td&gt;
            &lt;td&gt;12.8&lt;/td&gt;
            &lt;td&gt;6.9&lt;/td&gt;
            &lt;td&gt;2.5&lt;/td&gt;
            &lt;td&gt;926&lt;/td&gt;
            &lt;td&gt;803&lt;/td&gt;
            &lt;td&gt;865&lt;/td&gt;
            &lt;td&gt;3.4&lt;/td&gt;
            &lt;td&gt;10.0&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;61.0&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;40.0&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;40.2&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;5.1&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;1088&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;1163&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;1131&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;4.2&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;9.8&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
All results are from AudioEvals; the evaluation methods and further details can be found in the &lt;a href=&#34;https://github.com/OpenBMB/UltraEval-Audio&#34; target=&#34;_blank&#34;&gt;AudioEvals&lt;/a&gt; repository.&lt;br&gt;&lt;br&gt;
&lt;p&gt;&lt;strong&gt;End-to-end Voice Cloning&lt;/strong&gt;&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Task&lt;/th&gt;
            &lt;th colspan=&#34;2&#34;&gt;Voice cloning&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Metric&lt;/th&gt;
            &lt;th&gt;SIMO↑&lt;/th&gt;
            &lt;th&gt;SIMO↑&lt;/th&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Dataset&lt;/th&gt;
            &lt;th&gt;Seed-TTS test-zh&lt;/th&gt;
            &lt;th&gt;Seed-TTS test-en&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;F5-TTS&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;76&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;67&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;CosyVoice&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;75&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;64&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;FireRedTTS&lt;/td&gt;
            &lt;td&gt;63&lt;/td&gt;
            &lt;td&gt;46&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;57&lt;/td&gt;
            &lt;td&gt;47&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/div&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Click to view multimodal live streaming results.&lt;/summary&gt;
&lt;p&gt;&lt;strong&gt;Multimodal Live Streaming&lt;/strong&gt;: results on StreamingBench&lt;/p&gt;
&lt;table style=&#34;margin: 0px auto;&#34;&gt;
    &lt;thead&gt;
        &lt;tr&gt;
            &lt;th align=&#34;left&#34;&gt;Model&lt;/th&gt;
            &lt;th&gt;Size&lt;/th&gt;
            &lt;th&gt;Real-Time Video Understanding&lt;/th&gt;
            &lt;th&gt;Omni-Source Understanding&lt;/th&gt;
            &lt;th&gt;Contextual Understanding&lt;/th&gt;
            &lt;th&gt;Overall&lt;/th&gt;
        &lt;/tr&gt;
    &lt;/thead&gt;
    &lt;tbody align=&#34;center&#34;&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;6&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Proprietary&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Gemini 1.5 Pro&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;77.4&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;67.8&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;51.1&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;70.3&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;GPT-4o-202408&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;74.5&lt;/td&gt;
            &lt;td&gt;51.0&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;48.0&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;64.1&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Claude-3.5-Sonnet&lt;/td&gt;
            &lt;td&gt;-&lt;/td&gt;
            &lt;td&gt;74.0&lt;/td&gt;
            &lt;td&gt;41.4&lt;/td&gt;
            &lt;td&gt;37.8&lt;/td&gt;
            &lt;td&gt;59.7&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td colspan=&#34;6&#34; align=&#34;left&#34;&gt;&lt;strong&gt;Open-source&lt;/strong&gt;&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VILA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;61.5&lt;/td&gt;
            &lt;td&gt;37.5&lt;/td&gt;
            &lt;td&gt;26.7&lt;/td&gt;
            &lt;td&gt;49.5&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LongVA&lt;/td&gt;
            &lt;td&gt;7B&lt;/td&gt;
            &lt;td&gt;63.1&lt;/td&gt;
            &lt;td&gt;35.9&lt;/td&gt;
            &lt;td&gt;30.2&lt;/td&gt;
            &lt;td&gt;50.7&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LLaVA-Next-Video-34B&lt;/td&gt;
            &lt;td&gt;34B&lt;/td&gt;
            &lt;td&gt;69.8&lt;/td&gt;
            &lt;td&gt;41.7&lt;/td&gt;
            &lt;td&gt;34.3&lt;/td&gt;
            &lt;td&gt;56.7&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;Qwen2-VL-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;71.2&lt;/td&gt;
            &lt;td&gt;40.7&lt;/td&gt;
            &lt;td&gt;33.1&lt;/td&gt;
            &lt;td&gt;57.0&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;InternVL2-8B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;70.1&lt;/td&gt;
            &lt;td&gt;42.7&lt;/td&gt;
            &lt;td&gt;34.1&lt;/td&gt;
            &lt;td&gt;57.0&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;VITA-1.5&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;70.9&lt;/td&gt;
            &lt;td&gt;40.8&lt;/td&gt;
            &lt;td&gt;35.8&lt;/td&gt;
            &lt;td&gt;57.4&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;LLaVA-OneVision-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;74.3&lt;/td&gt;
            &lt;td&gt;40.8&lt;/td&gt;
            &lt;td&gt;31.0&lt;/td&gt;
            &lt;td&gt;58.4&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;InternLM-XC2.5-OL-7B&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;75.4&lt;/td&gt;
            &lt;td&gt;46.2&lt;/td&gt;
            &lt;td&gt;33.6&lt;/td&gt;
            &lt;td&gt;60.8&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-V 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;72.4&lt;/td&gt;
            &lt;td&gt;40.2&lt;/td&gt;
            &lt;td&gt;33.4&lt;/td&gt;
            &lt;td&gt;57.7&lt;/td&gt;
        &lt;/tr&gt;
        &lt;tr&gt;
            &lt;td nowrap=&#34;nowrap&#34; align=&#34;left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
            &lt;td&gt;8B&lt;/td&gt;
            &lt;td&gt;&lt;strong&gt;79.9&lt;/strong&gt;&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;53.4&lt;/u&gt;&lt;/td&gt;
            &lt;td&gt;38.5&lt;/td&gt;
            &lt;td&gt;&lt;u&gt;66.0&lt;/u&gt;&lt;/td&gt;
        &lt;/tr&gt;
    &lt;/tbody&gt;
&lt;/table&gt;
&lt;/details&gt;
&lt;h3 id=&#34;examples-1&#34;&gt;Examples &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;We deploy MiniCPM-o 2.6 on end devices. The demo videos are raw-speed recordings on an iPad Pro and in a web demo.&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://www.youtube.com/watch?v=vRIMbxJzStY&amp;t=2s&#34;&gt;&lt;img src=&#34;./assets/minicpmo2_6/2dot6_o_demo_video_img.png&#34; width=&#34;70%&#34;&gt;&lt;/a&gt;
&lt;/div&gt;
&lt;br&gt;
&lt;div style=&#34;display: flex; flex-direction: column; align-items: center;&#34;&gt;
  &lt;img src=&#34;assets/minicpmo2_6/minicpmo2_6_math_intersect.png&#34; alt=&#34;math&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
  &lt;img src=&#34;assets/minicpmo2_6/minicpmo2_6_diagram_train_NN.png&#34; alt=&#34;diagram&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
  &lt;img src=&#34;assets/minicpmo2_6/minicpmo2_6_multi-image_bike.png&#34; alt=&#34;bike&#34; style=&#34;margin-bottom: 5px;&#34;&gt;
&lt;/div&gt;
&lt;h2 id=&#34;legacy-models&#34;&gt;Legacy Models &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: left&#34;&gt;Model&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Introduction and Guidance&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 4.0&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/minicpm_v4_en.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 2.6&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/minicpm_v2dot6_en.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-Llama3-V 2.5&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/minicpm_llama3_v2dot5.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 2.0&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/minicpm_v2.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 1.0&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/minicpm_v1.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;OmniLMM-12B&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;./docs/omnilmm_en.md&#34; &gt;Document&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;minicpm-v--o-cookbook&#34;&gt;MiniCPM-V &amp;amp; o Cookbook
&lt;/h2&gt;&lt;p&gt;Discover comprehensive, ready-to-deploy solutions for the MiniCPM-V and MiniCPM-o model series in our structured &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;cookbook&lt;/a&gt;, which empowers developers to rapidly implement multimodal AI applications with integrated vision, speech, and live-streaming capabilities. Key features include:&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Easy Usage Documentation&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Our comprehensive &lt;a class=&#34;link&#34; href=&#34;https://minicpm-o.readthedocs.io/en/latest/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;documentation website&lt;/a&gt; presents every recipe in a clear, well-organized manner.
All features are displayed at a glance, making it easy for you to quickly find exactly what you need.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Broad User Spectrum&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;We support a wide range of users, from individuals to enterprises and researchers.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Individuals&lt;/strong&gt;: Enjoy effortless inference using &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/ollama/minicpm-v4_ollama.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/llama.cpp/minicpm-v4_llamacpp.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama.cpp&lt;/a&gt; with minimal setup.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Enterprises&lt;/strong&gt;: Achieve high-throughput, scalable performance with &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/vllm/minicpm-v4_vllm.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;vLLM&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/deployment/sglang/MiniCPM-v4_sglang.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SGLang&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Researchers&lt;/strong&gt;: Leverage advanced frameworks including &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_full.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Transformers&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/finetune_llamafactory.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA-Factory&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/swift.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SWIFT&lt;/a&gt;, and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/finetune/align_anything.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Align-anything&lt;/a&gt; to enable flexible model development and cutting-edge experimentation.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;strong&gt;Versatile Deployment Scenarios&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;Our ecosystem delivers optimal solutions for a variety of hardware environments and deployment demands.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Web demo&lt;/strong&gt;: Launch an interactive multimodal AI web demo with &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FastAPI&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Quantized deployment&lt;/strong&gt;: Maximize efficiency and minimize resource consumption using &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/gguf/minicpm-v4_gguf_quantize.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/quantization/bnb/minicpm-v4_bnb_quantize.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BNB&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;End devices&lt;/strong&gt;: Bring powerful AI experiences to &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenSQZ/MiniCPM-V-CookBook/blob/main/demo/ios_demo/ios.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;iPhone and iPad&lt;/a&gt;, supporting offline and privacy-sensitive applications.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;chat-with-our-demo-on-gradio-&#34;&gt;Chat with Our Demo on Gradio 🤗
&lt;/h2&gt;&lt;p&gt;We provide online and local demos powered by Hugging Face Gradio &lt;a href=&#39;https://github.com/gradio-app/gradio&#39;&gt;&lt;img src=&#39;https://img.shields.io/github/stars/gradio-app/gradio&#39;&gt;&lt;/a&gt;, one of the most popular model deployment frameworks. It supports streaming outputs, progress bars, queuing, alerts, and other useful features.&lt;/p&gt;
&lt;h3 id=&#34;online-demo&#34;&gt;Online Demo &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;Click here to try out the online demo of &lt;a class=&#34;link&#34; href=&#34;https://minicpm-omni-webdemo-us.modelbest.cn/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-o 2.6&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;http://120.92.209.146:8887/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V 2.6&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/openbmb/MiniCPM-Llama3-V-2_5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-Llama3-V 2.5&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/openbmb/MiniCPM-V-2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V 2.0&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;local-webui-demo&#34;&gt;Local WebUI Demo &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;You can easily build your own local WebUI demo using the following commands.&lt;/p&gt;
&lt;p&gt;Please ensure that &lt;code&gt;transformers==4.44.2&lt;/code&gt; is installed, as other versions may have compatibility issues.&lt;/p&gt;
&lt;p&gt;If you are using an older version of PyTorch, you might encounter the error &lt;code&gt;&amp;quot;weight_norm_fwd_first_dim_kernel&amp;quot; not implemented for &#39;BFloat16&#39;&lt;/code&gt;. In that case, add &lt;code&gt;self.minicpmo_model.tts.float()&lt;/code&gt; during model initialization.&lt;/p&gt;
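&lt;p&gt;A minimal standalone sketch of that workaround, assuming the model exposes its TTS module as &lt;code&gt;model.tts&lt;/code&gt; (as the attribute name above suggests):&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(&#39;openbmb/MiniCPM-o-2_6&#39;, trust_remote_code=True,
                                  attn_implementation=&#39;sdpa&#39;, torch_dtype=torch.bfloat16)
model = model.eval().cuda()

# Older PyTorch builds lack a BFloat16 kernel for weight_norm; keeping the TTS
# module in float32 avoids the error while the rest of the model stays in bfloat16.
model.tts.float()
&lt;/code&gt;&lt;/pre&gt;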
&lt;p&gt;&lt;strong&gt;For real-time voice/video call demo:&lt;/strong&gt;&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Launch the model server:&lt;/li&gt;
&lt;/ol&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -r requirements_o2.6.txt
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;python web_demos/minicpm-o_2.6/model_server.py
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ol start=&#34;2&#34;&gt;
&lt;li&gt;Launch the web server:&lt;/li&gt;
&lt;/ol&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Make sure Node and PNPM are installed.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;sudo apt-get update
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;sudo apt-get install nodejs npm
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;npm install -g pnpm
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; web_demos/minicpm-o_2.6/web_server
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# create ssl cert for https, https is required to request camera and microphone permissions.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;bash ./make_ssl_cert.sh  &lt;span class=&#34;c1&#34;&gt;# output key.pem and cert.pem&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pnpm install  &lt;span class=&#34;c1&#34;&gt;# install requirements&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pnpm run dev  &lt;span class=&#34;c1&#34;&gt;# start server&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Open &lt;code&gt;https://localhost:8088/&lt;/code&gt; in your browser and enjoy the real-time voice/video call.&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;For chatbot demo:&lt;/strong&gt;&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -r requirements_o2.6.txt
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;python web_demos/minicpm-o_2.6/chatbot_web_demo_o2.6.py
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Open &lt;code&gt;http://localhost:8000/&lt;/code&gt; in your browser and enjoy the vision-mode chatbot.&lt;/p&gt;
&lt;h2 id=&#34;inference&#34;&gt;Inference
&lt;/h2&gt;&lt;h3 id=&#34;model-zoo&#34;&gt;Model Zoo
&lt;/h3&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: left&#34;&gt;Model&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Device&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Memory&lt;/th&gt;
          &lt;th style=&#34;text-align: left&#34;&gt;         Description&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Download&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 4.5&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;GPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;18 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The latest version, with strong end-side multimodal performance for single-image, multi-image, and video understanding.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 4.5 gguf&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;CPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;8 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The gguf version, lower memory usage and faster inference.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 4.5 int4&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;GPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;9 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The int4 quantized version, lower GPU memory usage.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-V 4.5 AWQ&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;GPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;9 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The AWQ quantized version, lower GPU memory usage.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-4_5-AWQ&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-V-4_5-AWQ&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-o 2.6&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;GPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;18 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The latest omni-modal version, achieving GPT-4o-level performance for vision, speech, and multimodal live streaming on end-side devices.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-o 2.6 gguf&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;CPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;8 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The gguf version, lower memory usage and faster inference.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6-gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;MiniCPM-o 2.6 int4&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;GPU&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;9 GB&lt;/td&gt;
          &lt;td style=&#34;text-align: left&#34;&gt;The int4 quantized version, lower GPU memory usage.&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt;    &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/OpenBMB/MiniCPM-o-2_6-int4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;./assets/modelscope_logo.png&#34; width=&#34;20px&#34;&gt;&lt;/img&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
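&lt;p&gt;The quantized checkpoints load through the same &lt;code&gt;AutoModel&lt;/code&gt; interface as the full-precision models. A minimal sketch for the int4 variant (assuming the pre-quantized weights need no &lt;code&gt;torch_dtype&lt;/code&gt; override):&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;from transformers import AutoModel, AutoTokenizer

# The int4 checkpoint ships pre-quantized, so no torch_dtype is passed here.
model = AutoModel.from_pretrained(&#39;openbmb/MiniCPM-o-2_6-int4&#39;, trust_remote_code=True)
model = model.eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(&#39;openbmb/MiniCPM-o-2_6-int4&#39;, trust_remote_code=True)
&lt;/code&gt;&lt;/pre&gt;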
&lt;h3 id=&#34;multi-turn-conversation&#34;&gt;Multi-turn Conversation
&lt;/h3&gt;&lt;p&gt;If you wish to enable long-thinking mode, provide the argument &lt;code&gt;enable_thinking=True&lt;/code&gt; to the chat function.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -r requirements_o2.6.txt
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Please refer to the following code to run.&lt;/p&gt;
&lt;div align=&#34;center&#34;&gt;
&lt;img src=&#34;assets/minicpmo2_6/show_demo.jpg&#34; width=&#34;500px&#34;&gt;
&lt;/div&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;29
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;30
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;31
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;32
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;33
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;34
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;35
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;manual_seed&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;100&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sdpa or flash_attention_2, no eager&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/minicpmo2_6/show_demo.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;enable_thinking&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# If `enable_thinking=True`, the long-thinking mode is enabled.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# First round chat &lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;What is the landform in the picture?&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;enable_thinking&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;enable_thinking&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Second round chat: pass the history of the multi-turn conversation&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;assistant&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]})&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;user&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;What should I pay attention to when traveling here?&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]})&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;You will get the following output:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;# round1
The landform in the picture is karst topography. Karst landscapes are characterized by distinctive, jagged limestone hills or mountains with steep, irregular peaks and deep valleys—exactly what you see here. These unique formations result from the dissolution of soluble rocks like limestone over millions of years of water erosion.

This scene closely resembles the famous karst landscape of Guilin and Yangshuo in China’s Guangxi Province. The area features dramatic, pointed limestone peaks rising above serene rivers and lush green forests, creating a breathtaking, iconic natural landscape that attracts millions of visitors each year with its picturesque views.

# round2
When traveling to a karst landscape like this, here are some important tips:

1. Wear comfortable shoes: The terrain can be uneven and hilly.
2. Bring water and snacks for energy during hikes or boat rides.
3. Protect yourself from the sun with sunscreen, hats, and sunglasses—especially since you’ll likely spend time outdoors exploring scenic spots.
4. Respect local customs and nature regulations by not littering or disturbing wildlife.

By following these guidelines, you&amp;#39;ll have a safe and enjoyable trip while appreciating the stunning natural beauty of places such as Guilin’s karst mountains.
&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h4 id=&#34;chat-with-multiple-images&#34;&gt;Chat with Multiple Images
&lt;/h4&gt;&lt;details&gt;
&lt;summary&gt; Click to view Python code running MiniCPM-V-4_5 with multiple images as input. &lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sdpa or flash_attention_2, no eager&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image1&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;image1.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image2&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;image2.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;Compare image 1 and image 2, tell me about the differences between image 1 and image 2.&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;image2&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h4 id=&#34;in-context-few-shot-learning&#34;&gt;In-context Few-shot Learning
&lt;/h4&gt;&lt;details&gt;
&lt;summary&gt; Click to view Python code running MiniCPM-V-4_5 with few-shot input. &lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sdpa or flash_attention_2, no eager&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;production date&amp;#34;&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image1&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;example1.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer1&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;2023.08.04&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image2&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;example2.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer2&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;2007.04.24&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image_test&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;test.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]},&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;assistant&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image2&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]},&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;assistant&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer2&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image_test&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h4 id=&#34;chat-with-video&#34;&gt;Chat with Video
&lt;/h4&gt;&lt;details&gt;
&lt;summary&gt; Click to view Python code running MiniCPM-V-4_5 with video input and the 3D-Resampler. &lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;## The 3D-Resampler compresses multiple frames into 64 tokens by introducing temporal_ids.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# To achieve this, you need to organize your video data into two corresponding sequences: &lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;#   frames: List[Image]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;#   temporal_ids: List[List[Int]].&lt;/span&gt;
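&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;#   (illustrative: each inner list holds the temporal ids of the frames that are packed into one group)&lt;/span&gt;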
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;decord&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;VideoReader&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;cpu&lt;/span&gt;    &lt;span class=&#34;c1&#34;&gt;# pip install decord&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;scipy.spatial&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;cKDTree&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;numpy&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;as&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;np&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;math&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sdpa or flash_attention_2, no eager&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-V-4_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# or openbmb/MiniCPM-o-2_6&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;MAX_NUM_FRAMES&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;180&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# Maximum number of frames after packing; the actual maximum number of raw frames is MAX_NUM_FRAMES * MAX_NUM_PACKING.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;MAX_NUM_PACKING&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;3&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# Maximum number of frames packed into one group; valid range: 1-6.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;TIME_SCALE&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mf&#34;&gt;0.1&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;def&lt;/span&gt; &lt;span class=&#34;nf&#34;&gt;map_to_nearest_scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;values&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
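&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;c1&#34;&gt;# Snaps each value to the nearest entry of scale, e.g. map_to_nearest_scale([0.24, 0.91], [0.0, 0.5, 1.0]) -&amp;gt; array([0.0, 1.0])&lt;/span&gt;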
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tree&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;cKDTree&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;asarray&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[:,&lt;/span&gt; &lt;span class=&#34;kc&#34;&gt;None&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;indices&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;tree&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;query&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;asarray&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;values&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[:,&lt;/span&gt; &lt;span class=&#34;kc&#34;&gt;None&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;return&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;asarray&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;indices&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;def&lt;/span&gt; &lt;span class=&#34;nf&#34;&gt;group_array&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;arr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
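&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;c1&#34;&gt;# Splits arr into consecutive chunks of at most size, e.g. group_array([1, 2, 3, 4, 5], 2) -&amp;gt; [[1, 2], [3, 4], [5]]&lt;/span&gt;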
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;return&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;arr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;arr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;),&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;def&lt;/span&gt; &lt;span class=&#34;nf&#34;&gt;encode_video&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;choose_fps&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;None&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;def&lt;/span&gt; &lt;span class=&#34;nf&#34;&gt;uniform_sample&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;l&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;n&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
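&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;c1&#34;&gt;# Evenly samples n items from l, e.g. for len(l)=10, n=3 this picks indices [1, 5, 8]&lt;/span&gt;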
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;gap&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;l&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;n&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;idxs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;int&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;gap&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;+&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;gap&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;2&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;n&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;k&#34;&gt;return&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;l&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;idxs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;vr&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;VideoReader&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;ctx&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cpu&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;vr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_avg_fps&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;vr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;choose_fps&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;int&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;&amp;lt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_FRAMES&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;choose_frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;round&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;min&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;choose_fps&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;round&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;min&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;MAX_NUM_FRAMES&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;else&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;math&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ceil&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;choose_fps&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_FRAMES&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;&amp;lt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_PACKING&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;choose_frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;round&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;choose_fps&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;k&#34;&gt;else&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;choose_frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;round&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;MAX_NUM_FRAMES&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_PACKING&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_PACKING&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;vr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))]&lt;/span&gt;      
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;  &lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;array&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;uniform_sample&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;choose_frames&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;min&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;MAX_NUM_PACKING&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39; duration:&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;get video frames=&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;, packing_nums=&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;vr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_batch&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;asnumpy&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_idx_ts&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frame_idx&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;scale&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;arange&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;video_duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;TIME_SCALE&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_ts_id&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;map_to_nearest_scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_idx_ts&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;scale&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;TIME_SCALE&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_ts_id&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frame_ts_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;astype&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;int32&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;assert&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;==&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_ts_id&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;fromarray&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;v&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;astype&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;uint8&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;v&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;frame_ts_id_group&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;group_array&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_ts_id&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;packing_nums&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;return&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frame_ts_id_group&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;video_test.mp4&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;5&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sampling fps used to select frames from the video&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;kc&#34;&gt;None&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# Set force_packing to force 3D packing on; otherwise encode_video chooses the packing number dynamically from the video duration.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frame_ts_id_group&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;encode_video&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;force_packing&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Describe the video&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;frames&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;+&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]},&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_image_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_slice_nums&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temporal_ids&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame_ts_id_group&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
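&lt;p&gt;In the video chat call above, &lt;code&gt;temporal_ids=frame_ts_id_group&lt;/code&gt; supplies the grouped frame timestamps produced by &lt;code&gt;encode_video&lt;/code&gt;, so the model can associate each packed frame with its point in time; &lt;code&gt;use_image_id=False&lt;/code&gt; and &lt;code&gt;max_slice_nums=1&lt;/code&gt; disable per-image IDs and high-resolution slicing for the frames, which helps keep token usage manageable for long frame sequences.&lt;/p&gt;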
&lt;h4 id=&#34;speech-and-audio-mode&#34;&gt;Speech and Audio Mode
&lt;/h4&gt;&lt;p&gt;Model initialization&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;librosa&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-o-2_6&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# sdpa or flash_attention_2, no eager&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-o-2_6&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;init_tts&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# initialize the TTS module for speech output&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tts&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;float&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# keep TTS in float32; bfloat16 TTS kernels can be unsupported on some PyTorch versions&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
&lt;h5 id=&#34;mimick&#34;&gt;Mimick &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;The &lt;code&gt;Mimick&lt;/code&gt; task reflects a model&amp;rsquo;s end-to-end speech modeling capability: the model takes audio input, outputs an ASR transcription, and then reconstructs the original audio. The higher the similarity between the reconstructed and the original audio, the stronger the model&amp;rsquo;s foundational end-to-end speech modeling capability.&lt;/p&gt;
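&lt;p&gt;The snippet below loads a 10-second example clip, prompts the model to repeat it, and, with &lt;code&gt;generate_audio=True&lt;/code&gt;, writes the reconstructed speech to &lt;code&gt;output_mimick.wav&lt;/code&gt; while returning the transcription text in &lt;code&gt;res&lt;/code&gt;.&lt;/p&gt;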
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;mimick_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Please repeat each user&amp;#39;s speech, including voice style and speech content.&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;audio_input&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/input_examples/Trump_WEF_2018_10s.mp3&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# load the audio to be mimicked&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Other example inputs that exercise different speech-centric features:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# `./assets/input_examples/fast-pace.wav`&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# `./assets/input_examples/chi-english-1.wav`&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# `./assets/input_examples/exciting-emotion.wav`&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;mimick_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;audio_input&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;output_mimick.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# save the tts result to output_audio_path&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
&lt;h5 id=&#34;general-speech-conversation-with-configurable-voices&#34;&gt;General Speech Conversation with Configurable Voices &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;A common usage scenario for &lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; is role-playing a specific character based on an audio prompt: the model mimics the character&amp;rsquo;s voice to some extent and adopts the character&amp;rsquo;s persona in text, including language style. In this mode, &lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; sounds &lt;strong&gt;more natural and human-like&lt;/strong&gt;. Self-defined audio prompts can be used to customize the character&amp;rsquo;s voice in an end-to-end manner.&lt;/p&gt;
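&lt;p&gt;The example below builds the system prompt from a reference voice with &lt;code&gt;get_sys_prompt(mode=&amp;#39;audio_roleplay&amp;#39;)&lt;/code&gt; and then carries the character&amp;rsquo;s voice and persona across two conversation rounds.&lt;/p&gt;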
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;29
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;30
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;31
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;32
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/input_examples/icl_20.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# load the reference audio&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mode&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;audio_roleplay&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;language&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;en&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# round one&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;xxx.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]]}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_roleplay_round_1.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# round two&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;assistant&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;})&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# append the assistant reply to the history in place&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;xxx.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]]}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_roleplay_round_2.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
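&lt;p&gt;For multi-turn conversations such as the one above, append the assistant reply and the next user turn to &lt;code&gt;msgs&lt;/code&gt; in place before calling &lt;code&gt;model.chat&lt;/code&gt; again; Python&amp;rsquo;s &lt;code&gt;list.append&lt;/code&gt; returns &lt;code&gt;None&lt;/code&gt;, so its return value must not be assigned and reused as the history. A minimal sketch of this loop follows, assuming the &lt;code&gt;model&lt;/code&gt;, &lt;code&gt;tokenizer&lt;/code&gt;, and &lt;code&gt;sys_prompt&lt;/code&gt; objects defined above; the file names &lt;code&gt;q1.wav&lt;/code&gt; and &lt;code&gt;q2.wav&lt;/code&gt; are placeholders.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal multi-turn sketch: the history list is mutated in place each round.
msgs = [sys_prompt]
for turn, wav in enumerate([&amp;#39;q1.wav&amp;#39;, &amp;#39;q2.wav&amp;#39;], start=1):  # placeholder user clips
    audio, _ = librosa.load(wav, sr=16000, mono=True)
    msgs.append({&amp;#39;role&amp;#39;: &amp;#39;user&amp;#39;, &amp;#39;content&amp;#39;: [audio]})
    res = model.chat(
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        max_new_tokens=128,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f&amp;#39;result_round_{turn}.wav&amp;#39;,
    )
    msgs.append({&amp;#39;role&amp;#39;: &amp;#39;assistant&amp;#39;, &amp;#39;content&amp;#39;: res})  # list.append returns None
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;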
&lt;h5 id=&#34;speech-conversation-as-an-ai-assistant&#34;&gt;Speech Conversation as an AI Assistant &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;&lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; can also act as an AI assistant, though with a limited choice of voices. In this mode, the model is &lt;strong&gt;less human-like and more like a voice assistant&lt;/strong&gt;, but it follows instructions more reliably. For the demo, we suggest &lt;code&gt;assistant_female_voice&lt;/code&gt;, &lt;code&gt;assistant_male_voice&lt;/code&gt;, or &lt;code&gt;assistant_default_female_voice&lt;/code&gt;; other voices may work but are not as stable as these defaults.&lt;/p&gt;
&lt;p&gt;&lt;em&gt;Please note that &lt;code&gt;assistant_female_voice&lt;/code&gt; and &lt;code&gt;assistant_male_voice&lt;/code&gt; are more stable but sound robotic, while &lt;code&gt;assistant_default_female_voice&lt;/code&gt; is more human-like but less stable; its voice often changes across turns. We therefore suggest the stable voices &lt;code&gt;assistant_female_voice&lt;/code&gt; and &lt;code&gt;assistant_male_voice&lt;/code&gt;.&lt;/em&gt;&lt;/p&gt;
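&lt;p&gt;The flow below mirrors the role-play example, except that the system prompt is built with &lt;code&gt;mode=&amp;#39;audio_assistant&amp;#39;&lt;/code&gt; and one of the assistant reference voices is loaded as &lt;code&gt;ref_audio&lt;/code&gt;.&lt;/p&gt;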
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;29
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;30
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;31
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;32
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/input_examples/assistant_female_voice.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or use `./assets/input_examples/assistant_male_voice.wav`&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mode&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;audio_assistant&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;language&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;en&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;xxx.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]]}&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# load the user&amp;#39;s audio question&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# round one&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_assistant_round_1.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# round two&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;({&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;assistant&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;})&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# keep history in msgs; list.append mutates in place and returns None&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;xxx.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]]}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_assistant_round_2.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
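&lt;p&gt;The same pattern extends to any number of rounds: append each user turn and each assistant reply to &lt;code&gt;msgs&lt;/code&gt; in place (&lt;code&gt;list.append&lt;/code&gt; returns &lt;code&gt;None&lt;/code&gt;, so don&#39;t rebind the list to its result). A minimal sketch, reusing &lt;code&gt;model&lt;/code&gt;, &lt;code&gt;tokenizer&lt;/code&gt;, and &lt;code&gt;sys_prompt&lt;/code&gt; from above; the audio file names are placeholders:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;import librosa

# Multi-turn audio chat: grow msgs round by round.
msgs = [sys_prompt]
for i, wav in enumerate(['question_1.wav', 'question_2.wav'], start=1):  # placeholder files
    audio, _ = librosa.load(wav, sr=16000, mono=True)
    msgs.append({'role': 'user', 'content': [audio]})
    res = model.chat(
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        max_new_tokens=128,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_assistant_round_{i}.wav',
    )
    msgs.append({'role': 'assistant', 'content': res})  # keep the reply in history for the next round
&lt;/code&gt;&lt;/pre&gt;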
&lt;h5 id=&#34;instruction-to-speech&#34;&gt;Instruction-to-Speech &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;&lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; can also do Instruction-to-Speech, aka &lt;strong&gt;Voice Creation&lt;/strong&gt;: describe a voice in detail, and the model generates speech in a voice matching that description. For more sample instructions, see &lt;a class=&#34;link&#34; href=&#34;https://voxinstruct.github.io/VoxInstruct/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://voxinstruct.github.io/VoxInstruct/&lt;/a&gt;.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;instruction&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;Speak like a male charming superstar, radiating confidence and style in every word.&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;instruction&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_voice_creation.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
&lt;h5 id=&#34;voice-cloning&#34;&gt;Voice Cloning &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;&lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; can also do zero-shot text-to-speech, aka &lt;strong&gt;Voice Cloning&lt;/strong&gt;. In this mode, the model acts as a TTS model: given a short reference audio, it reads new text in that speaker&#39;s voice.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/input_examples/icl_20.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# load the reference audio&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ref_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mode&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;voice_cloning&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;language&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;en&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;text_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Please read the text below.&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;text_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content that you want to read&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;user_question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_voice_cloning.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;hr/&gt;
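&lt;p&gt;The voice-cloning system prompt is independent of the text to be read, so it can be reused to render several strings in the same cloned voice. A minimal sketch, reusing &lt;code&gt;model&lt;/code&gt;, &lt;code&gt;tokenizer&lt;/code&gt;, and the &lt;code&gt;sys_prompt&lt;/code&gt; built above; the texts are placeholders:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Batch zero-shot TTS in the cloned voice.
texts = ['First sentence to read.', 'Second sentence to read.']  # placeholders
for i, text in enumerate(texts, start=1):
    msgs = [sys_prompt, {'role': 'user', 'content': ['Please read the text below.', text]}]
    model.chat(
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        max_new_tokens=128,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
        output_audio_path=f'result_voice_cloning_{i}.wav',
    )
&lt;/code&gt;&lt;/pre&gt;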
&lt;h5 id=&#34;addressing-various-audio-understanding-tasks&#34;&gt;Addressing Various Audio Understanding Tasks &lt;!-- omit in toc --&gt;
&lt;/h5&gt;&lt;p&gt;&lt;code&gt;MiniCPM-o-2.6&lt;/code&gt; can also be used to address various audio understanding tasks, such as automatic speech recognition (ASR), speaker analysis, general audio captioning, and sound scene tagging.&lt;/p&gt;
&lt;p&gt;For audio-to-text tasks, you can use the following prompts; a small lookup-table sketch follows the list:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;ASR with ZH (same as AST en2zh): &lt;code&gt;请仔细听这段音频片段，并将其内容逐字记录。&lt;/code&gt; (i.e., &#34;Please listen to this audio clip carefully and transcribe its content verbatim.&#34;)&lt;/li&gt;
&lt;li&gt;ASR with EN (same as AST zh2en): &lt;code&gt;Please listen to the audio snippet carefully and transcribe the content.&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;Speaker Analysis: &lt;code&gt;Based on the speaker&#39;s content, speculate on their gender, condition, age range, and health status.&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;General Audio Caption: &lt;code&gt;Summarize the main content of the audio.&lt;/code&gt;&lt;/li&gt;
&lt;li&gt;General Sound Scene Tagging: &lt;code&gt;Utilize one keyword to convey the audio&#39;s content or the associated scene.&lt;/code&gt;&lt;/li&gt;
&lt;/ul&gt;
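&lt;p&gt;For convenience, these prompts can be kept in a small lookup table so a task name selects the prompt. A minimal sketch; the task keys are illustrative:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Task-name -&gt; prompt lookup for the audio-to-text tasks listed above.
AUDIO_TASK_PROMPTS = {
    'asr_zh': '请仔细听这段音频片段，并将其内容逐字记录。',
    'asr_en': 'Please listen to the audio snippet carefully and transcribe the content.',
    'speaker_analysis': &#34;Based on the speaker's content, speculate on their gender, condition, age range, and health status.&#34;,
    'audio_caption': 'Summarize the main content of the audio.',
    'scene_tagging': &#34;Utilize one keyword to convey the audio's content or the associated scene.&#34;,
}

task_prompt = AUDIO_TASK_PROMPTS['asr_en'] + '\n'  # equivalent to the assignment in the example below
&lt;/code&gt;&lt;/pre&gt;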
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;task_prompt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;Please listen to the audio snippet carefully and transcribe the content.&amp;#34;&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;+&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;se&#34;&gt;\n&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# swap in any of the task prompts listed above&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;audio_input&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/input_examples/audio_understanding.mp3&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# load the audio to be captioned&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;task_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;audio_input&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;128&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.3&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;result_audio_understanding.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h4 id=&#34;multimodal-live-streaming&#34;&gt;Multimodal Live Streaming
&lt;/h4&gt;&lt;details&gt;
&lt;summary&gt; Click to view Python code running MiniCPM-o 2.6 with omni chat inference (non-streaming). &lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;29
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;30
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;31
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;32
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;33
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;34
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;35
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;36
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;37
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;38
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;39
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;40
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;41
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;42
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;43
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;44
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;45
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;46
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;47
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;48
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;49
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;50
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;51
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;52
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;53
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;54
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;55
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;56
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;57
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;58
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;59
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;60
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;61
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;62
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;63
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;64
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;65
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;66
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;67
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;68
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;69
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;70
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;71
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;72
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;73
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;74
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;75
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;math&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;numpy&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;as&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;np&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;moviepy.editor&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;VideoFileClip&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;tempfile&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;librosa&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;soundfile&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;as&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;sf&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;def&lt;/span&gt; &lt;span class=&#34;nf&#34;&gt;get_video_chunk_content&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;flatten&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;video&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;VideoFileClip&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;video_duration:&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;video&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;with&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;tempfile&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;NamedTemporaryFile&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;suffix&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;.wav&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;delete&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;k&#34;&gt;as&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;temp_audio_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;temp_audio_file_path&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;temp_audio_file&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;name&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;video&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;write_audiofile&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;temp_audio_file_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;codec&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;pcm_s16le&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;fps&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;audio_np&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;librosa&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;load&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;temp_audio_file_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;16000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;mono&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;num_units&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;math&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;ceil&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;duration&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;c1&#34;&gt;# 1 frame + 1s audio chunk&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;num_units&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;frame&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;video&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_frame&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;image&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;fromarray&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;((&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;frame&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;astype&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;uint8&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;audio&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;audio_np&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sr&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;flatten&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;extend&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;([&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&amp;lt;unit&amp;gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# &amp;#34;&amp;lt;unit&amp;gt;&amp;#34; marks each 1-second frame+audio chunk&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;k&#34;&gt;else&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;([&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&amp;lt;unit&amp;gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;return&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-o-2_6&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;attn_implementation&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;sdpa&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;torch_dtype&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;torch&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;bfloat16&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cuda&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-o-2_6&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;init_tts&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# On older PyTorch versions you might hit the error &amp;#34;weight_norm_fwd_first_dim_kernel&amp;#34; not implemented for &amp;#39;BFloat16&amp;#39;; if so, convert the TTS module to float32:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# model.tts.float()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# https://huggingface.co/openbmb/MiniCPM-o-2_6/blob/main/assets/Skiing.mp4&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;assets/Skiing.mp4&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;sys_msg&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;get_sys_prompt&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;mode&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;omni&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;language&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;en&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# if use voice clone prompt, please set ref_audio&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# ref_audio_path = &amp;#39;/path/to/ref_audio&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode=&amp;#39;omni&amp;#39;, language=&amp;#39;en&amp;#39;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;get_video_chunk_content&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msg&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;user&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sys_msg&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;msg&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# please set generate_audio=True and output_audio_path to save the tts result&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;output.wav&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.5&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_new_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;4096&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;omni_input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# set omni_input=True for omni (interleaved image/audio) inference&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_tts_template&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;output_audio_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;max_slice_nums&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_image_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;return_dict&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt; Click to view Python code running MiniCPM-o 2.6 with streaming inference. &lt;/summary&gt;
&lt;p&gt;Note: streaming inference incurs a slight performance degradation because the audio encoding is not global.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;29
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;30
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;31
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;32
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;33
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;34
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;35
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;36
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;37
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;38
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;39
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;40
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;41
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;42
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;43
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;44
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;45
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;46
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;47
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;48
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;49
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;50
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;51
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# a new conversation needs a session reset first; this clears the KV cache&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;reset_session&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;get_video_chunk_content&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;video_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;flatten&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;123&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# 1. prefill system prompt&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;streaming_prefill&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sys_msg&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;],&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# 2. prefill video/audio chunks&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;content&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;contents&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;role&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;user&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;content&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;content&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;streaming_prefill&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# 3. generate&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;streaming_generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;session_id&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;temperature&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mf&#34;&gt;0.5&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;audios&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;text&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;generate_audio&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;audio_wav&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;audio_wav&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;sampling_rate&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sampling_rate&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;txt&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;text&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;audios&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;append&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;audio_wav&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;text&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;+=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;txt&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;np&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;concatenate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;audios&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sf&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;write&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;output.wav&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;samplerate&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;sampling_rate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;text:&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;text&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;audio saved to output.wav&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;else&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;n&#34;&gt;text&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;+=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;r&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;text&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;text:&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;text&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h3 id=&#34;inference-on-multiple-gpus&#34;&gt;Inference on Multiple GPUs
&lt;/h3&gt;&lt;p&gt;You can run MiniCPM-Llama3-V 2.5 on multiple low-VRAM GPUs (12 GB or 16 GB) by distributing the model&amp;rsquo;s layers across them. Please refer to this &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM-V/blob/main/docs/inference_on_multiple_gpus.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;tutorial&lt;/a&gt; for detailed instructions on how to load the model and run inference across multiple low-VRAM GPUs.&lt;/p&gt;
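&lt;p&gt;For quick orientation, the sketch below shows the general idea using automatic device mapping from Hugging Face accelerate; the memory caps are illustrative assumptions, and the tutorial above remains the authoritative, finer-grained recipe.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# A hedged sketch, not the official tutorial: spread the model's layers
# over two low-VRAM GPUs with accelerate's automatic device mapping
# (assumes accelerate is installed). The max_memory caps are illustrative.
import torch
from transformers import AutoModel, AutoTokenizer

model = AutoModel.from_pretrained(
    'openbmb/MiniCPM-Llama3-V-2_5',
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map='auto',                    # let accelerate place the layers
    max_memory={0: '11GiB', 1: '11GiB'},  # leave headroom on each 12 GB card
)
tokenizer = AutoTokenizer.from_pretrained('openbmb/MiniCPM-Llama3-V-2_5', trust_remote_code=True)
model.eval()  # model.chat(...) then works as in the single-GPU examples
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;/div&gt;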
&lt;h3 id=&#34;inference-on-mac&#34;&gt;Inference on Mac
&lt;/h3&gt;&lt;details&gt;
&lt;summary&gt;Click to view an example of running MiniCPM-Llama3-V 2.5 on 💻 Mac with MPS (Apple silicon or AMD GPUs).&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# test.py (needs more than 16 GB of memory)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;torch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;PIL&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;transformers&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-Llama3-V-2_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;low_cpu_mem_usage&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;to&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;device&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;mps&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoTokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;from_pretrained&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;openbmb/MiniCPM-Llama3-V-2_5&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;trust_remote_code&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;eval&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;()&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;image&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;open&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;./assets/hk_OCR.jpg&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;convert&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;RGB&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;question&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;Where is this photo taken?&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[{&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;role&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;user&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s1&#34;&gt;&amp;#39;content&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;question&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;}]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;context&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;_&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;image&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;msgs&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;context&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;None&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;tokenizer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;sampling&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;answer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Run with command:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;PYTORCH_ENABLE_MPS_FALLBACK&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;1&lt;/span&gt; python test.py
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;h3 id=&#34;efficient-inference-with-llamacpp-ollama-vllm&#34;&gt;Efficient Inference with llama.cpp, Ollama, vLLM
&lt;/h3&gt;&lt;p&gt;See &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/llama.cpp/tree/minicpmv-main/examples/llava/README-minicpmv2.6.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;our fork of llama.cpp&lt;/a&gt; for more detail. This implementation supports smooth inference of 16~18 tokens/s on iPad (test environment: iPad Pro + M4).&lt;/p&gt;
&lt;p&gt;See &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/ollama/blob/minicpm-v2.6/examples/minicpm-v2.6/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;our fork of Ollama&lt;/a&gt; for more detail. This implementation supports smooth inference of 16~18 tokens/s on iPad (test environment: iPad Pro + M4).&lt;/p&gt;
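&lt;p&gt;As a hedged illustration of both CLI workflows (binary names, model files, and flags follow the forks&amp;rsquo; READMEs and may differ by version; the file paths are placeholders):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;# llama.cpp fork: multimodal CLI with a quantized model and vision projector
./llama-minicpmv-cli -m ggml-model-Q4_K_M.gguf \
    --mmproj mmproj-model-f16.gguf \
    --image example.jpg -p &#34;What is in the image?&#34;

# Ollama fork: an image path inside the prompt is picked up automatically
ollama run minicpm-v &#34;Describe this image: ./example.jpg&#34;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;/div&gt;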
&lt;details&gt;
&lt;summary&gt; vLLM now officially supports MiniCPM-V 2.6, MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.0; for now, use our fork to run MiniCPM-o 2.6. Click to expand. &lt;/summary&gt;
&lt;ol&gt;
&lt;li&gt;Install vLLM (&amp;gt;=0.7.1):&lt;/li&gt;
&lt;/ol&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install vllm
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ol start=&#34;2&#34;&gt;
&lt;li&gt;Run Example:&lt;/li&gt;
&lt;/ol&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.vllm.ai/en/latest/getting_started/examples/vision_language.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Vision Language&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.vllm.ai/en/latest/getting_started/examples/audio_language.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Audio Language&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
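&lt;p&gt;As a hedged sketch of the offline API (the model name, image placeholder, and sampling values follow the linked examples and are assumptions rather than the only supported form):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal offline-inference sketch (assumes vLLM &amp;gt;= 0.7); the
# (&amp;lt;image&amp;gt;./&amp;lt;/image&amp;gt;) placeholder follows the MiniCPM-V examples.
from PIL import Image
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams

MODEL = 'openbmb/MiniCPM-V-2_6'
llm = LLM(model=MODEL, trust_remote_code=True, max_model_len=4096)
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

# Build a chat prompt that contains the image placeholder.
messages = [{'role': 'user', 'content': '(&amp;lt;image&amp;gt;./&amp;lt;/image&amp;gt;)\nWhat is in this image?'}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

image = Image.open('example.jpg').convert('RGB')  # illustrative input path
outputs = llm.generate(
    {'prompt': prompt, 'multi_modal_data': {'image': image}},
    sampling_params=SamplingParams(temperature=0.5, max_tokens=256),
)
print(outputs[0].outputs[0].text)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;/div&gt;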
&lt;h2 id=&#34;fine-tuning&#34;&gt;Fine-tuning
&lt;/h2&gt;&lt;h3 id=&#34;simple-fine-tuning&#34;&gt;Simple Fine-tuning &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;We support simple fine-tuning with Hugging Face for MiniCPM-o 2.6, MiniCPM-V 2.6, MiniCPM-Llama3-V 2.5 and MiniCPM-V 2.0.&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;./finetune/readme.md&#34; &gt;Reference Document&lt;/a&gt;&lt;/p&gt;
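&lt;p&gt;For orientation only, here is a generic LoRA sketch with Hugging Face PEFT; it is a hypothetical illustration (the target module names are assumptions), and the reference document above remains the authoritative recipe.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Generic PEFT/LoRA sketch, not the repo's finetune script; the
# target_modules names are assumptions, not the officially tuned set.
from peft import LoraConfig, get_peft_model
from transformers import AutoModel

model = AutoModel.from_pretrained('openbmb/MiniCPM-V-2_6', trust_remote_code=True)
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],  # assumed names
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()  # only the adapter weights should train
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;/div&gt;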
&lt;h3 id=&#34;with-align-anything&#34;&gt;With Align-Anything &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;The PKU-Alignment Team supports fine-tuning MiniCPM-o 2.6 (both vision and audio, with SFT and DPO) through the &lt;a class=&#34;link&#34; href=&#34;https://github.com/PKU-Alignment/align-anything&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Align-Anything framework&lt;/a&gt;. Align-Anything is a scalable framework that aims to align any-modality large models with human intentions, open-sourcing its &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/PKU-Alignment/align-anything&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;datasets, models and benchmarks&lt;/a&gt;. Benefiting from its concise and modular design, it supports 30+ open-source benchmarks, 40+ models, and algorithms including SFT, SimPO, RLHF, &lt;em&gt;etc&lt;/em&gt;. It also provides 30+ directly runnable scripts, making it easy for beginners to get started quickly.&lt;/p&gt;
&lt;p&gt;Best Practices: &lt;a class=&#34;link&#34; href=&#34;https://github.com/PKU-Alignment/align-anything/tree/main/scripts&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-o 2.6&lt;/a&gt;.&lt;/p&gt;
&lt;h3 id=&#34;with-llama-factory&#34;&gt;With LLaMA-Factory &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;We support fine-tuning MiniCPM-o 2.6 and MiniCPM-V 2.6 with the LLaMA-Factory framework. LLaMA-Factory lets you flexibly customize fine-tuning (LoRA/Full/QLoRA) of 200+ LLMs without writing code, through its built-in web UI, LLaMA Board. It supports training methods such as SFT/PPO/DPO/KTO and advanced algorithms such as GaLore/BAdam/LLaMA-Pro/PiSSA/LongLoRA.&lt;/p&gt;
&lt;p&gt;Best Practices: &lt;a class=&#34;link&#34; href=&#34;./docs/llamafactory_train_and_infer.md&#34; &gt;MiniCPM-o 2.6 | MiniCPM-V 2.6&lt;/a&gt;.&lt;/p&gt;
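&lt;p&gt;As a hedged pointer (assuming LLaMA-Factory is installed per its README), the web UI from which these fine-tuning runs are configured is launched with a single command:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;# assumes LLaMA-Factory is installed per its README
llamafactory-cli webui   # opens LLaMA Board for no-code LoRA/Full/QLoRA setup
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;
&lt;/div&gt;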
&lt;h3 id=&#34;with-the-swift-framework&#34;&gt;With the SWIFT Framework &lt;!-- omit in toc --&gt;
&lt;/h3&gt;&lt;p&gt;We now support fine-tuning the MiniCPM-V series with the SWIFT framework. SWIFT supports training, inference, evaluation and deployment of nearly 200 LLMs and MLLMs. It supports the lightweight training solutions provided by PEFT and a complete adapter library, including techniques such as NEFTune, LoRA+ and LLaMA-Pro.&lt;/p&gt;
&lt;p&gt;Best Practices: &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v%e6%9c%80%e4%bd%b3%e5%ae%9e%e8%b7%b5.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V 1.0&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/swift/blob/main/docs/source/Multi-Modal/minicpm-v-2%e6%9c%80%e4%bd%b3%e5%ae%9e%e8%b7%b5.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V 2.0&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/ms-swift/issues/1613&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V 2.6&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;awesome-work-using-minicpm-v--minicpm-o&#34;&gt;Awesome work using MiniCPM-V &amp;amp; MiniCPM-o
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/CatchTheTornado/text-extract-api&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;text-extract-api&lt;/a&gt;: Document extraction API using OCR and Ollama-supported models &lt;img src=&#34;https://img.shields.io/github/stars/CatchTheTornado/text-extract-api&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/heshengtao/comfyui_LLM_party&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;comfyui_LLM_party&lt;/a&gt;: Build LLM workflows and integrate them into existing image workflows &lt;img src=&#34;https://img.shields.io/github/stars/heshengtao/comfyui_LLM_party&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/imanoop7/Ollama-OCR&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama-OCR&lt;/a&gt;: An OCR package that uses VLMs through Ollama to extract text from images and PDFs &lt;img src=&#34;https://img.shields.io/github/stars/imanoop7/Ollama-OCR&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/MixLabPro/comfyui-mixlab-nodes&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;comfyui-mixlab-nodes&lt;/a&gt;: A ComfyUI node suite supporting Workflow-to-App, GPT&amp;amp;3D and more &lt;img src=&#34;https://img.shields.io/github/stars/MixLabPro/comfyui-mixlab-nodes&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/HumanAIGC-Engineering/OpenAvatarChat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenAvatarChat&lt;/a&gt;: An interactive digital-human conversation implementation that runs on a single PC &lt;img src=&#34;https://img.shields.io/github/stars/HumanAIGC-Engineering/OpenAvatarChat&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/arkohut/pensieve&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;pensieve&lt;/a&gt;: A privacy-focused passive recording project that captures screen content &lt;img src=&#34;https://img.shields.io/github/stars/arkohut/pensieve&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/icereed/paperless-gpt&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;paperless-gpt&lt;/a&gt;: Use LLMs to handle paperless-ngx with AI-powered titles, tags and OCR &lt;img src=&#34;https://img.shields.io/github/stars/icereed/paperless-gpt&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/kimjammer/Neuro&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Neuro&lt;/a&gt;: A recreation of Neuro-Sama, but running on local models on consumer hardware &lt;img src=&#34;https://img.shields.io/github/stars/kimjammer/Neuro&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;faqs&#34;&gt;FAQs
&lt;/h2&gt;&lt;p&gt;Click here to view the &lt;a class=&#34;link&#34; href=&#34;./docs/faqs.md&#34; &gt;FAQs&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;limitations&#34;&gt;Limitations
&lt;/h2&gt;&lt;p&gt;As an experimental release, MiniCPM-o 2.6 has notable limitations that merit further investigation and improvement.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Unstable speech output.&lt;/strong&gt; Speech generation can be flawed, producing noisy backgrounds and meaningless sounds.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Repeated responses.&lt;/strong&gt; The model tends to repeat its response when it encounters similar consecutive user queries.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;High latency on the web demo.&lt;/strong&gt; Users may experience unusually high latency when using the web demo hosted on overseas servers. We recommend deploying the demo locally or using it over a good network connection.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;model-license&#34;&gt;Model License &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;This repository is released under the &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Apache-2.0&lt;/a&gt; License.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;The usage of MiniCPM-o/V model weights must strictly follow &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM Model License.md&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;The models and weights of MiniCPM are completely free for academic research. After filling out a &lt;a class=&#34;link&#34; href=&#34;https://modelbest.feishu.cn/share/base/form/shrcnpV5ZT9EJ6xYjh3Kx0J6v8g&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&amp;ldquo;questionnaire&amp;rdquo;&lt;/a&gt; for registration, they are also available for free commercial use.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;statement&#34;&gt;Statement &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;p&gt;As MLLMs, MiniCPM-o/V models generate content by learning from a large number of multimodal corpora, but they cannot comprehend, express personal opinions, or make value judgements. Anything generated by MiniCPM-o/V models does not represent the views and positions of the model developers.&lt;/p&gt;
&lt;p&gt;We will not be liable for any problems arising from the use of MiniCPM-o/V models, including but not limited to data security issues, risks of public opinion, or any risks and problems arising from the misdirection, misuse, dissemination, or improper use of the models.&lt;/p&gt;
&lt;h2 id=&#34;institutions&#34;&gt;Institutions  &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;p&gt;This project is developed by the following institutions:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;img src=&#34;assets/thunlp.png&#34; width=&#34;28px&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://nlp.csai.tsinghua.edu.cn/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;THUNLP&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;img src=&#34;assets/modelbest.png&#34; width=&#34;28px&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://modelbest.cn/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModelBest&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;-star-history&#34;&gt;🌟 Star History &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;p align=&#34;center&#34;&gt;
  &lt;img src=&#34;assets/star-history-25-09-02.png&#34;/&gt;
&lt;/p&gt;
&lt;h2 id=&#34;key-techniques-and-other-multimodal-projects&#34;&gt;Key Techniques and Other Multimodal Projects &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;p&gt;👏 Welcome to explore key techniques of MiniCPM-o/V and other multimodal projects of our team:&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/VisCPM/tree/main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VisCPM&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/RLPR&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLPR&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLHF-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLHF-V&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/thunlp/LLaVA-UHD&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA-UHD&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/RLHF-V/RLAIF-V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLAIF-V&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;citation&#34;&gt;Citation &lt;!-- omit in toc --&gt;
&lt;/h2&gt;&lt;p&gt;If you find our model/code/paper helpful, please consider citing our papers 📝 and starring us ⭐️!&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bib&#34; data-lang=&#34;bib&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@article&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;yao2024minicpm&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{MiniCPM-V: A GPT-4V Level MLLM on Your Phone}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Yao, Yuan and Yu, Tianyu and Zhang, Ao and Wang, Chongyi and Cui, Junbo and Zhu, Hongji and Cai, Tianchi and Li, Haoyu and Zhao, Weilin and He, Zhihui and others}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;journal&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{arXiv preprint arXiv:2408.01800}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2024}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        <item>
        <title>Hands-On-Large-Language-Models</title>
        <link>https://producthunt.programnotes.cn/en/p/hands-on-large-language-models/</link>
        <pubDate>Wed, 27 Aug 2025 15:29:45 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/hands-on-large-language-models/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1733939910552-7752db0c03d0?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTYyNzk2MTd8&amp;ixlib=rb-4.1.0" alt="Featured image of post Hands-On-Large-Language-Models" /&gt;&lt;h1 id=&#34;handsonllmhands-on-large-language-models&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/HandsOnLLM/Hands-On-Large-Language-Models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;HandsOnLLM/Hands-On-Large-Language-Models&lt;/a&gt;
&lt;/h1&gt;
&lt;p&gt;&lt;a href=&#34;https://www.linkedin.com/in/jalammar/&#34;&gt;&lt;img src=&#34;https://img.shields.io/badge/Follow%20Jay-blue.svg?logo=linkedin&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://www.linkedin.com/in/mgrootendorst/&#34;&gt;&lt;img src=&#34;https://img.shields.io/badge/Follow%20Maarten-blue.svg?logo=linkedin&#34;&gt;&lt;/a&gt;
&lt;a href=&#34;https://www.deeplearning.ai/short-courses/how-transformer-llms-work/?utm_campaign=handsonllm-launch&amp;utm_medium=partner&#34;&gt;&lt;img src=&#34;https://img.shields.io/badge/DeepLearning.AI%20Course-NEW!-&amp;labelColor=black&amp;color=red.svg?logo=data:image/svg%2bxml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAuMDAwMzY1MjgxIC0wLjAwMDE0MDE0MiAzMy4yOSAzMy4xNSI+Cgk8cGF0aCBkPSJNMTYuNjQzIDMzLjE0NWMtMy4yOTIgMC02LjUxLS45NzItOS4yNDYtMi43OTNhMTYuNTg4IDE2LjU4OCAwIDAxLTYuMTMtNy40MzhBMTYuNTA3IDE2LjUwNyAwIDAxLjMyIDEzLjM0YTE2LjU1IDE2LjU1IDAgMDE0LjU1NS04LjQ4NUExNi42NjUgMTYuNjY1IDAgMDExMy4zOTYuMzE4YTE2LjcxIDE2LjcxIDAgMDE5LjYxNi45NDQgMTYuNjI4IDE2LjYyOCAwIDAxNy40NyA2LjEwMyAxNi41MjIgMTYuNTIyIDAgMDEyLjgwNCA5LjIwN2MwIDQuMzk2LTEuNzUzIDguNjEtNC44NzQgMTEuNzE5YTE2LjY4IDE2LjY4IDAgMDEtMTEuNzY5IDQuODU0em0uMTI1LTYuNjI4YzYuOTA2IDAgMTIuNTE3LTUuNjk4IDEyLjUxNy0xMi43MyAwLTcuMDMtNS42MS0xMi43MjUtMTIuNTE3LTEyLjcyNS02LjkwNiAwLTEyLjUxNyA1LjY5OC0xMi41MTcgMTIuNzI1IDAgNy4wMjcgNS42MTEgMTIuNzMgMTIuNTE3IDEyLjczem0tLjEyNS0yLjkxOGMtNi4yODkgMC0xMS4zODYtNC45MjUtMTEuMzg2LTExLjAwMkM1LjI1NyA2LjUyIDEwLjM2IDEuNTkgMTYuNjQzIDEuNTljNi4yODQgMCAxMS4zODYgNC45MyAxMS4zODYgMTEuMDA3cy01LjA5NyAxMS4wMDItMTEuMzg2IDExLjAwMnptLS4yNDItNC41MDhjNC43NyAwIDguNjMzLTMuNjc5IDguNjMzLTguMjE4IDAtNC41MzgtMy44ODUtOC4yMjEtOC42MzMtOC4yMjEtNC43NDcgMC04LjYzMiAzLjY3OS04LjYzMiA4LjIyMSAwIDQuNTQzIDMuODg1IDguMjE4IDguNjMyIDguMjE4eiIgZmlsbD0iI0ZENEE2MSIvPgo8L3N2Zz4=&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Welcome! In this repository you will find the code for all examples throughout the book &lt;a class=&#34;link&#34; href=&#34;https://www.amazon.com/Hands-Large-Language-Models-Understanding/dp/1098150961&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hands-On Large Language Models&lt;/a&gt; written by &lt;a class=&#34;link&#34; href=&#34;https://www.linkedin.com/in/jalammar/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Jay Alammar&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://www.linkedin.com/in/mgrootendorst/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Maarten Grootendorst&lt;/a&gt; which we playfully dubbed: &lt;br&gt;&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;&lt;b&gt;&lt;i&gt;&#34;The Illustrated LLM Book&#34;&lt;/i&gt;&lt;/b&gt;&lt;/p&gt;
&lt;p&gt;Through the visually educational nature of this book and with &lt;strong&gt;almost 300 custom-made figures&lt;/strong&gt;, learn the practical tools and concepts you need to use Large Language Models today!&lt;/p&gt;
&lt;p&gt;&lt;a href=&#34;https://www.amazon.com/Hands-Large-Language-Models-Understanding/dp/1098150961&#34;&gt;&lt;img src=&#34;images/book_cover.png&#34; width=&#34;50%&#34; &gt;&lt;/a&gt;&lt;/p&gt;
&lt;br&gt;
&lt;p&gt;The book is available on:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.amazon.com/Hands-Large-Language-Models-Understanding/dp/1098150961&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Amazon&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.shroffpublishers.com/books/computer-science/large-language-models/9789355425522/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Shroff Publishers (India)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.oreilly.com/library/view/hands-on-large-language/9781098150952/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;O&amp;rsquo;Reilly&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.amazon.com/Hands-Large-Language-Models-Alammar-ebook/dp/B0DGZ46G88/ref=tmm_kin_swatch_0?_encoding=UTF8&amp;amp;qid=&amp;amp;sr=&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Kindle&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.barnesandnoble.com/w/hands-on-large-language-models-jay-alammar/1145185960&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Barnes and Noble&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.goodreads.com/book/show/210408850-hands-on-large-language-models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Goodreads&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;table-of-contents&#34;&gt;Table of Contents
&lt;/h2&gt;&lt;p&gt;We advise running all examples through Google Colab for the easiest setup. Google Colab lets you use a T4 GPU with 16 GB of VRAM for free. All examples were mainly built and tested using Google Colab, so it should be the most stable platform. However, any other cloud provider should work as well.&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Chapter&lt;/th&gt;
          &lt;th&gt;Notebook&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 1: Introduction to Language Models&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter01/Chapter%201%20-%20Introduction%20to%20Language%20Models.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 2: Tokens and Embeddings&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter02/Chapter%202%20-%20Tokens%20and%20Token%20Embeddings.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 3: Looking Inside Transformer LLMs&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter03/Chapter%203%20-%20Looking%20Inside%20LLMs.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 4: Text Classification&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter04/Chapter%204%20-%20Text%20Classification.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 5: Text Clustering and Topic Modeling&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter05/Chapter%205%20-%20Text%20Clustering%20and%20Topic%20Modeling.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 6: Prompt Engineering&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter06/Chapter%206%20-%20Prompt%20Engineering.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 7: Advanced Text Generation Techniques and Tools&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter07/Chapter%207%20-%20Advanced%20Text%20Generation%20Techniques%20and%20Tools.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 8: Semantic Search and Retrieval-Augmented Generation&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter08/Chapter%208%20-%20Semantic%20Search.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 9: Multimodal Large Language Models&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter09/Chapter%209%20-%20Multimodal%20Large%20Language%20Models.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 10: Creating Text Embedding Models&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter10/Chapter%2010%20-%20Creating%20Text%20Embedding%20Models.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 11: Fine-tuning Representation Models for Classification&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter11/Chapter%2011%20-%20Fine-Tuning%20BERT.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Chapter 12: Fine-tuning Generation Models&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/github/HandsOnLLM/Hands-On-Large-Language-Models/blob/main/chapter12/Chapter%2012%20-%20Fine-tuning%20Generation%20Models.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open In Colab&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
You can check the &lt;a class=&#34;link&#34; href=&#34;.setup/&#34; &gt;setup&lt;/a&gt; folder for a quick-start guide to installing all packages locally, and the &lt;a class=&#34;link&#34; href=&#34;.setup/conda/&#34; &gt;conda&lt;/a&gt; folder for a complete guide on setting up your environment, including conda and PyTorch installation.
Note that depending on your OS, Python version, and dependencies, your results might differ slightly; however, they
should be similar to the examples in the book. A quick environment sanity check is sketched below.&lt;/p&gt;
&lt;/blockquote&gt;
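&lt;p&gt;For example, a minimal check along these lines (a sketch; it assumes &lt;code&gt;torch&lt;/code&gt; and &lt;code&gt;transformers&lt;/code&gt; were installed following the setup guide) confirms the environment and shows whether a GPU is visible:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal environment sanity check (a sketch; assumes torch and
# transformers were installed per the setup guide).
import torch
import transformers

print(&#34;torch:&#34;, torch.__version__)
print(&#34;transformers:&#34;, transformers.__version__)
print(&#34;CUDA available:&#34;, torch.cuda.is_available())
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;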
&lt;h2 id=&#34;reviews&#34;&gt;Reviews
&lt;/h2&gt;&lt;blockquote&gt;
&lt;p&gt;&amp;ldquo;&lt;em&gt;Jay and Maarten have continued their tradition of providing beautifully illustrated and insightful descriptions of complex topics in their new book. Bolstered with working code, timelines, and references to key papers, their book is a valuable resource for anyone looking to understand the main techniques behind how Large Language Models are built.&lt;/em&gt;&amp;rdquo;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Andrew Ng&lt;/strong&gt; - founder of &lt;a class=&#34;link&#34; href=&#34;https://www.deeplearning.ai/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepLearning.AI&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;blockquote&gt;
&lt;p&gt;&amp;ldquo;&lt;em&gt;This is an exceptional guide to the world of language models and their practical applications in industry. Its highly-visual coverage of generative, representational, and retrieval applications of language models empowers readers to quickly understand, use, and refine LLMs. Highly recommended!&lt;/em&gt;&amp;rdquo;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Nils Reimers&lt;/strong&gt; - Director of Machine Learning at Cohere | creator of &lt;a class=&#34;link&#34; href=&#34;https://github.com/UKPLab/sentence-transformers&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;sentence-transformers&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;blockquote&gt;
&lt;p&gt;&amp;ldquo;&lt;em&gt;I can’t think of another book that is more important to read right now. On every single page, I learned something that is critical to success in this era of language models.&lt;/em&gt;&amp;rdquo;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Josh Starmer&lt;/strong&gt; - &lt;a class=&#34;link&#34; href=&#34;https://www.youtube.com/channel/UCtYLUTtgS3k1Fg4y5tAhLbw&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StatQuest&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;blockquote&gt;
&lt;p&gt;&amp;ldquo;&lt;em&gt;If you’re looking to get up to speed in everything regarding LLMs, look no further! In this wonderful book, Jay and Maarten will take you from zero to expert in the history and latest advances in large language models. With very intuitive explanations, great real-life examples, clear illustrations, and comprehensive code labs, this book lifts the curtain on the complexities of transformer models, tokenizers, semantic search, RAG, and many other cutting-edge technologies. A must read for anyone interested in the latest AI technology!&lt;/em&gt;&amp;rdquo;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Luis Serrano, PhD&lt;/strong&gt; - Founder and CEO of &lt;a class=&#34;link&#34; href=&#34;https://www.youtube.com/@SerranoAcademy&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Serrano Academy&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;blockquote&gt;
&lt;p&gt;&amp;ldquo;&lt;em&gt;Hands-On Large Language Models brings clarity and practical examples to cut through the hype of AI. It provides a wealth of great diagrams and visual aids to supplement the clear explanations. The worked examples and code make concrete what other books leave abstract. The book starts with simple introductory beginnings, and steadily builds in scope. By the final chapters, you will be fine-tuning and building your own large language models with confidence.&lt;/em&gt;&amp;rdquo;&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;Leland McInnes&lt;/strong&gt; - Researcher at the Tutte Institute for Mathematics and Computing | creator of &lt;a class=&#34;link&#34; href=&#34;https://github.com/lmcinnes/umap&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;UMAP&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/scikit-learn-contrib/hdbscan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;HDBSCAN&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
&lt;hr&gt;
&lt;h2 id=&#34;bonus-content&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;bonus/&#34; &gt;Bonus content!&lt;/a&gt;
&lt;/h2&gt;&lt;p&gt;We attempted to pack as much information into the book as possible without it being overwhelming. However, even with a 400-page book there is still much to discover!&lt;/p&gt;
&lt;p&gt;We continue to create more guides that complement the book and go more in-depth into new and &lt;a class=&#34;link&#34; href=&#34;bonus/&#34; &gt;exciting topics&lt;/a&gt;:&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-mamba-and-state&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;A Visual Guide to Mamba&lt;/a&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-quantization&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;A Visual Guide to Quantization&lt;/a&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://jalammar.github.io/illustrated-stable-diffusion/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;The Illustrated Stable Diffusion&lt;/a&gt;&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/mamba.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/quant.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/diffusion.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-mixture-of-experts&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;A Visual Guide to Mixture of Experts&lt;/a&gt;&lt;/strong&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://newsletter.maartengrootendorst.com/p/a-visual-guide-to-reasoning-llms&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;A Visual Guide to Reasoning LLMs&lt;/a&gt;&lt;/strong&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://newsletter.languagemodels.co/p/the-illustrated-deepseek-r1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;The Illustrated DeepSeek-R1&lt;/a&gt;&lt;/strong&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/moe.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/reasoning.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/images/deepseek.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
	
&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;citation&#34;&gt;Citation
&lt;/h2&gt;&lt;p&gt;Please consider citing the book if you find it useful for your research:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;9
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-fallback&#34; data-lang=&#34;fallback&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;@book{hands-on-llms-book,
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  author       = {Jay Alammar and Maarten Grootendorst},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  title        = {Hands-On Large Language Models},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  publisher    = {O&amp;#39;Reilly},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  year         = {2024},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  isbn         = {978-1098150969},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  url          = {https://www.oreilly.com/library/view/hands-on-large-language/9781098150952/},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  github       = {https://github.com/HandsOnLLM/Hands-On-Large-Language-Models}
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;}
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        <item>
        <title>anthropic-cookbook</title>
        <link>https://producthunt.programnotes.cn/en/p/anthropic-cookbook/</link>
        <pubDate>Sat, 21 Jun 2025 15:28:31 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/anthropic-cookbook/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1681055543029-8398bcd49519?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTA0OTA4MDd8&amp;ixlib=rb-4.1.0" alt="Featured image of post anthropic-cookbook" /&gt;&lt;h1 id=&#34;anthropicsanthropic-cookbook&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;anthropics/anthropic-cookbook&lt;/a&gt;
&lt;/h1&gt;&lt;h1 id=&#34;anthropic-cookbook&#34;&gt;Anthropic Cookbook
&lt;/h1&gt;&lt;p&gt;The Anthropic Cookbook provides code and guides designed to help developers build with Claude, offering copy-able code snippets that you can easily integrate into your own projects.&lt;/p&gt;
&lt;h2 id=&#34;prerequisites&#34;&gt;Prerequisites
&lt;/h2&gt;&lt;p&gt;To make the most of the examples in this cookbook, you&amp;rsquo;ll need an Anthropic API key (sign up for free &lt;a class=&#34;link&#34; href=&#34;https://www.anthropic.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;).&lt;/p&gt;
&lt;p&gt;While the code examples are primarily written in Python, the concepts can be adapted to any programming language that supports interaction with the Anthropic API.&lt;/p&gt;
&lt;p&gt;If you&amp;rsquo;re new to working with the Anthropic API, we recommend starting with our &lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/courses/tree/master/anthropic_api_fundamentals&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Anthropic API Fundamentals course&lt;/a&gt; to get a solid foundation.&lt;/p&gt;
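&lt;p&gt;To give a sense of the style of the recipes, a first call to the Messages API might look like the minimal sketch below (not a cookbook recipe verbatim; the model name is an assumption, and the key is read from the &lt;code&gt;ANTHROPIC_API_KEY&lt;/code&gt; environment variable):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# A minimal sketch of calling the Anthropic Messages API.
# Assumes `pip install anthropic` and ANTHROPIC_API_KEY in the environment.
# The model name is an assumption; use any model you have access to.
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY
message = client.messages.create(
    model=&#34;claude-3-5-sonnet-latest&#34;,
    max_tokens=256,
    messages=[{&#34;role&#34;: &#34;user&#34;, &#34;content&#34;: &#34;Summarize RAG in one sentence.&#34;}],
)
print(message.content[0].text)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;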
&lt;h2 id=&#34;explore-further&#34;&gt;Explore Further
&lt;/h2&gt;&lt;p&gt;Looking for more resources to enhance your experience with Claude and AI assistants? Check out these helpful links:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.anthropic.com/claude/docs/guide-to-anthropics-prompt-engineering-resources&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Anthropic developer documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://support.anthropic.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Anthropic support docs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.anthropic.com/discord&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Anthropic Discord community&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;p&gt;The Anthropic Cookbook thrives on the contributions of the developer community. We value your input, whether it&amp;rsquo;s submitting an idea, fixing a typo, adding a new guide, or improving an existing one. By contributing, you help make this resource even more valuable for everyone.&lt;/p&gt;
&lt;p&gt;To avoid duplication of efforts, please review the existing issues and pull requests before contributing.&lt;/p&gt;
&lt;p&gt;If you have ideas for new examples or guides, share them on the &lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/issues&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;issues page&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;table-of-recipes&#34;&gt;Table of recipes
&lt;/h2&gt;&lt;h3 id=&#34;skills&#34;&gt;Skills
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/skills/classification&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Classification&lt;/a&gt;: Explore techniques for text and data classification using Claude.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/skills/retrieval_augmented_generation&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Retrieval Augmented Generation&lt;/a&gt;: Learn how to enhance Claude&amp;rsquo;s responses with external knowledge (a minimal sketch follows this list).&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/skills/summarization&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Summarization&lt;/a&gt;: Discover techniques for effective text summarization with Claude.&lt;/li&gt;
&lt;/ul&gt;
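&lt;p&gt;As promised above, a deliberately simple retrieval-augmented sketch: pick the most relevant passage with a toy scoring function, then pass it to Claude as context. The word-overlap scorer is a hypothetical stand-in for the embedding-based search the actual recipes use:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# A toy RAG loop (a sketch). Word overlap stands in for a real
# embedding-based retriever; the recipes use proper vector search.
import anthropic

docs = [
    &#34;Claude supports tool use via the Messages API.&#34;,
    &#34;Retrieval-augmented generation grounds answers in external documents.&#34;,
]

def score(query: str, doc: str) -&gt; int:
    # Hypothetical stand-in for cosine similarity over embeddings.
    return len(set(query.lower().split()) &amp; set(doc.lower().split()))

query = &#34;How does retrieval-augmented generation work?&#34;
context = max(docs, key=lambda d: score(query, d))

client = anthropic.Anthropic()
reply = client.messages.create(
    model=&#34;claude-3-5-sonnet-latest&#34;,  # assumed model name
    max_tokens=256,
    messages=[{&#34;role&#34;: &#34;user&#34;,
               &#34;content&#34;: f&#34;Context: {context}\n\nQuestion: {query}&#34;}],
)
print(reply.content[0].text)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;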
&lt;h3 id=&#34;tool-use-and-integration&#34;&gt;Tool Use and Integration
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/tool_use&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Tool use&lt;/a&gt;: Learn how to integrate Claude with external tools and functions to extend its capabilities (see the sketch after this list).
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/tool_use/customer_service_agent.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Customer service agent&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/tool_use/calculator_tool.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Calculator integration&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/how_to_make_sql_queries.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SQL queries&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
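&lt;p&gt;The shape of a tool definition is worth seeing once: you describe a tool with a JSON schema, and Claude may answer with a &lt;code&gt;tool_use&lt;/code&gt; block whose input you then execute yourself. A minimal sketch (the tool name and schema are illustrative, not the notebooks&amp;rsquo; exact code):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# A minimal tool-use sketch (illustrative; see the notebooks for full loops).
import anthropic

client = anthropic.Anthropic()
tools = [{
    &#34;name&#34;: &#34;calculator&#34;,  # illustrative tool name
    &#34;description&#34;: &#34;Evaluate a basic arithmetic expression.&#34;,
    &#34;input_schema&#34;: {
        &#34;type&#34;: &#34;object&#34;,
        &#34;properties&#34;: {&#34;expression&#34;: {&#34;type&#34;: &#34;string&#34;}},
        &#34;required&#34;: [&#34;expression&#34;],
    },
}]

response = client.messages.create(
    model=&#34;claude-3-5-sonnet-latest&#34;,  # assumed model name
    max_tokens=256,
    tools=tools,
    messages=[{&#34;role&#34;: &#34;user&#34;, &#34;content&#34;: &#34;What is 17 * 23?&#34;}],
)
# If Claude chose to call the tool, a tool_use block appears in the content.
for block in response.content:
    if block.type == &#34;tool_use&#34;:
        print(block.name, block.input)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;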
&lt;h3 id=&#34;third-party-integrations&#34;&gt;Third-Party Integrations
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/third_party&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Retrieval augmented generation&lt;/a&gt;: Supplement Claude&amp;rsquo;s knowledge with external data sources.
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/third_party/Pinecone/rag_using_pinecone.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Vector databases (Pinecone)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/third_party/Wikipedia/wikipedia-search-cookbook.ipynb/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Wikipedia&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/read_web_pages_with_haiku.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Web pages&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/third_party/Brave/web_search_using_brave.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Internet search (Brave)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/third_party/VoyageAI/how_to_create_embeddings.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Embeddings with Voyage AI&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;multimodal-capabilities&#34;&gt;Multimodal Capabilities
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/tree/main/multimodal&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Vision with Claude&lt;/a&gt; (a sketch follows this list):
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/getting_started_with_vision.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Getting started with images&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/best_practices_for_vision.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Best practices for vision&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/reading_charts_graphs_powerpoints.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Interpreting charts and graphs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/how_to_transcribe_text.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Extracting content from forms&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/illustrated_responses.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Generate images with Claude&lt;/a&gt;: Use Claude with Stable Diffusion for image generation.&lt;/li&gt;
&lt;/ul&gt;
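&lt;p&gt;Images are passed to Claude as base64-encoded content blocks alongside text. A minimal sketch (the file path is a placeholder, and &lt;code&gt;media_type&lt;/code&gt; must match the actual file):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# A minimal vision sketch: one image plus a question.
# &#34;photo.jpg&#34; is a placeholder path.
import base64
import anthropic

with open(&#34;photo.jpg&#34;, &#34;rb&#34;) as f:
    image_b64 = base64.b64encode(f.read()).decode(&#34;utf-8&#34;)

client = anthropic.Anthropic()
message = client.messages.create(
    model=&#34;claude-3-5-sonnet-latest&#34;,  # assumed model name
    max_tokens=256,
    messages=[{
        &#34;role&#34;: &#34;user&#34;,
        &#34;content&#34;: [
            {&#34;type&#34;: &#34;image&#34;,
             &#34;source&#34;: {&#34;type&#34;: &#34;base64&#34;,
                        &#34;media_type&#34;: &#34;image/jpeg&#34;,
                        &#34;data&#34;: image_b64}},
            {&#34;type&#34;: &#34;text&#34;, &#34;text&#34;: &#34;Describe this image in one sentence.&#34;},
        ],
    }],
)
print(message.content[0].text)
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;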
&lt;h3 id=&#34;advanced-techniques&#34;&gt;Advanced Techniques
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/using_sub_agents.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Sub-agents&lt;/a&gt;: Learn how to use Haiku as a sub-agent in combination with Opus.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/pdf_upload_summarization.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Upload PDFs to Claude&lt;/a&gt;: Parse and pass PDFs as text to Claude.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/building_evals.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Automated evaluations&lt;/a&gt;: Use Claude to automate the prompt evaluation process.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/how_to_enable_json_mode.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Enable JSON mode&lt;/a&gt;: Ensure consistent JSON output from Claude (see the prefill sketch after this list).&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/building_moderation_filter.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Create a moderation filter&lt;/a&gt;: Use Claude to create a content moderation filter for your application.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/anthropics/anthropic-cookbook/blob/main/misc/prompt_caching.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Prompt caching&lt;/a&gt;: Learn techniques for efficient prompt caching with Claude.&lt;/li&gt;
&lt;/ul&gt;
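&lt;p&gt;The idea behind the JSON-mode recipe is prefilling: start the assistant turn with an opening brace so the model must continue a JSON object. A compact sketch of that trick:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Sketch of the prefill trick behind consistent JSON output:
# the assistant turn is prefilled with &#34;{&#34;, so the reply continues the object.
import json
import anthropic

client = anthropic.Anthropic()
response = client.messages.create(
    model=&#34;claude-3-5-sonnet-latest&#34;,  # assumed model name
    max_tokens=256,
    messages=[
        {&#34;role&#34;: &#34;user&#34;,
         &#34;content&#34;: &#34;Give the capital and population of France as JSON &#34;
                    &#34;with keys &#39;capital&#39; and &#39;population&#39;.&#34;},
        {&#34;role&#34;: &#34;assistant&#34;, &#34;content&#34;: &#34;{&#34;},  # prefill
    ],
)
data = json.loads(&#34;{&#34; + response.content[0].text)
print(data[&#34;capital&#34;])
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;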
&lt;h2 id=&#34;additional-resources&#34;&gt;Additional Resources
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/aws-samples/anthropic-on-aws&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Anthropic on AWS&lt;/a&gt;: Explore examples and solutions for using Claude on AWS infrastructure.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/aws-samples/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AWS Samples&lt;/a&gt;: A collection of code samples from AWS which can be adapted for use with Claude. Note that some samples may require modification to work optimally with Claude.&lt;/li&gt;
&lt;/ul&gt;
</description>
        </item>
        <item>
        <title>LLaMA-Factory</title>
        <link>https://producthunt.programnotes.cn/en/p/llama-factory/</link>
        <pubDate>Tue, 27 May 2025 15:31:11 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/llama-factory/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1680153527310-1a70b47af6e9?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NDgzMzA5MjJ8&amp;ixlib=rb-4.1.0" alt="Featured image of post LLaMA-Factory" /&gt;&lt;h1 id=&#34;hiyougallama-factory&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;hiyouga/LLaMA-Factory&lt;/a&gt;
&lt;/h1&gt;&lt;p&gt;&lt;img src=&#34;https://producthunt.programnotes.cn/assets/logo.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;# LLaMA Factory&#34;
	
	
&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/stargazers&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/stars/hiyouga/LLaMA-Factory?style=social&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Repo stars&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/commits/main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/last-commit/hiyouga/LLaMA-Factory&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub last commit&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/graphs/contributors&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/contributors/hiyouga/LLaMA-Factory?color=orange&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub contributors&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://github.com/hiyouga/LLaMA-Factory/actions/workflows/tests.yml/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub workflow&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://pypi.org/project/llamafactory/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/pypi/v/llamafactory&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;PyPI&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://scholar.google.com/scholar?cites=12620864006390196564&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/citation-476-green&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Citation&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/pulls&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/PRs-welcome-blue&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub pull request&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://twitter.com/llamafactory_ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/twitter/follow/llamafactory_ai&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Twitter&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://discord.gg/rKfvV9r9FK&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://dcbadge.vercel.app/api/server/rKfvV9r9FK?compact=true&amp;amp;style=flat&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Discord&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://gitcode.com/zhengyaowei/LLaMA-Factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://gitcode.com/zhengyaowei/LLaMA-Factory/star/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitCode&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://colab.research.google.com/assets/colab-badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open in Colab&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://gallery.pai-ml.com/assets/open-in-dsw.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Open in DSW&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/hiyouga/LLaMA-Board&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/%f0%9f%a4%97-Open%20in%20Spaces-blue&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Spaces&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/studios/hiyouga/LLaMA-Board&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/ModelScope-Open%20in%20Studios-blue&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Studios&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/SageMaker-Open%20in%20AWS-blue&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;SageMaker&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h3 id=&#34;used-by-amazon-nvidia-aliyun-etc&#34;&gt;Used by &lt;a class=&#34;link&#34; href=&#34;https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Amazon&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://developer.nvidia.com/rtx/ai-toolkit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://help.aliyun.com/zh/pai/use-cases/fine-tune-a-llama-3-model-with-llama-factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Aliyun&lt;/a&gt;, etc.
&lt;/h3&gt;&lt;div align=&#34;center&#34; markdown=&#34;1&#34;&gt;
&lt;h3 id=&#34;supporters-&#34;&gt;Supporters ❤️
&lt;/h3&gt;&lt;a href=&#34;https://warp.dev/llama-factory&#34;&gt;
    &lt;img alt=&#34;Warp sponsorship&#34; width=&#34;400&#34; src=&#34;https://github.com/user-attachments/assets/ab8dd143-b0fd-4904-bdc5-dd7ecac94eae&#34;&gt;
&lt;/a&gt;
&lt;h4 id=&#34;warp-the-agentic-terminal-for-developers&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://warp.dev/llama-factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Warp, the agentic terminal for developers&lt;/a&gt;
&lt;/h4&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://warp.dev/llama-factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Available for macOS, Linux, &amp;amp; Windows&lt;/a&gt;&lt;/p&gt;
&lt;hr&gt;
&lt;h3 id=&#34;easily-fine-tune-100-large-language-models-with-zero-code-cli-and-web-ui&#34;&gt;Easily fine-tune 100+ large language models with zero-code &lt;a class=&#34;link&#34; href=&#34;#quickstart&#34; &gt;CLI&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;#fine-tuning-with-llama-board-gui-powered-by-gradio&#34; &gt;Web UI&lt;/a&gt;
&lt;/h3&gt;&lt;p&gt;&lt;img src=&#34;https://trendshift.io/api/badge/repositories/4535&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;GitHub Trend&#34;
	
	
&gt;&lt;/p&gt;
&lt;/div&gt;
&lt;p&gt;👋 Join our &lt;a class=&#34;link&#34; href=&#34;assets/wechat.jpg&#34; &gt;WeChat&lt;/a&gt; or &lt;a class=&#34;link&#34; href=&#34;assets/wechat_npu.jpg&#34; &gt;NPU user group&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;[ English | &lt;a class=&#34;link&#34; href=&#34;README_zh.md&#34; &gt;Chinese (中文)&lt;/a&gt; ]&lt;/p&gt;&lt;p&gt;&lt;strong&gt;Fine-tuning a large language model can be as easy as&amp;hellip;&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/user-attachments/assets/3991a3a8-4276-4d30-9cab-4cb0c4b9b99e&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/user-attachments/assets/3991a3a8-4276-4d30-9cab-4cb0c4b9b99e&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Choose your path:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Documentation&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://llamafactory.readthedocs.io/en/latest/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://llamafactory.readthedocs.io/en/latest/&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Colab (free)&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Local machine&lt;/strong&gt;: Please refer to &lt;a class=&#34;link&#34; href=&#34;#getting-started&#34; &gt;usage&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;PAI-DSW (free trial)&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;blockquote&gt;
&lt;p&gt;[!NOTE]
Apart from the links above, all other websites are unauthorized third-party websites; please use them with caution.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h2 id=&#34;table-of-contents&#34;&gt;Table of Contents
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#features&#34; &gt;Features&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#blogs&#34; &gt;Blogs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#changelog&#34; &gt;Changelog&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#supported-models&#34; &gt;Supported Models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#supported-training-approaches&#34; &gt;Supported Training Approaches&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#provided-datasets&#34; &gt;Provided Datasets&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#requirement&#34; &gt;Requirement&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#getting-started&#34; &gt;Getting Started&lt;/a&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#installation&#34; &gt;Installation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#data-preparation&#34; &gt;Data Preparation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#quickstart&#34; &gt;Quickstart&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#fine-tuning-with-llama-board-gui-powered-by-gradio&#34; &gt;Fine-Tuning with LLaMA Board GUI&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#build-docker&#34; &gt;Build Docker&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#deploy-with-openai-style-api-and-vllm&#34; &gt;Deploy with OpenAI-style API and vLLM&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#download-from-modelscope-hub&#34; &gt;Download from ModelScope Hub&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#download-from-modelers-hub&#34; &gt;Download from Modelers Hub&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#use-wb-logger&#34; &gt;Use W&amp;amp;B Logger&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#use-swanlab-logger&#34; &gt;Use SwanLab Logger&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#projects-using-llama-factory&#34; &gt;Projects using LLaMA Factory&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#license&#34; &gt;License&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#citation&#34; &gt;Citation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#acknowledgement&#34; &gt;Acknowledgement&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;features&#34;&gt;Features
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;Various models&lt;/strong&gt;: LLaMA, LLaVA, Mistral, Mixtral-MoE, Qwen, Qwen2-VL, DeepSeek, Yi, Gemma, ChatGLM, Phi, etc.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Integrated methods&lt;/strong&gt;: (Continuous) pre-training, (multimodal) supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO, etc.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Scalable resources&lt;/strong&gt;: 16-bit full-tuning, freeze-tuning, LoRA, and 2/3/4/5/6/8-bit QLoRA via AQLM/AWQ/GPTQ/LLM.int8/HQQ/EETQ (a generic LoRA sketch follows this list).&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Advanced algorithms&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://github.com/jiaweizzhao/GaLore&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GaLore&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/Ledzy/BAdam&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BAdam&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/zhuhanqing/APOLLO&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;APOLLO&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/zyushun/Adam-mini&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Adam-mini&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/KellerJordan/Muon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Muon&lt;/a&gt;, DoRA, LongLoRA, LLaMA Pro, Mixture-of-Depths, LoRA+, LoftQ and PiSSA.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Practical tricks&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://github.com/Dao-AILab/flash-attention&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FlashAttention-2&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/unslothai/unsloth&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Unsloth&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/linkedin/Liger-Kernel&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Liger Kernel&lt;/a&gt;, RoPE scaling, NEFTune and rsLoRA.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Wide tasks&lt;/strong&gt;: Multi-turn dialogue, tool use, image understanding, visual grounding, video recognition, audio understanding, etc.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Experiment monitors&lt;/strong&gt;: LlamaBoard, TensorBoard, Wandb, MLflow, &lt;a class=&#34;link&#34; href=&#34;https://github.com/SwanHubX/SwanLab&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SwanLab&lt;/a&gt;, etc.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;Faster inference&lt;/strong&gt;: OpenAI-style API, Gradio UI and CLI with &lt;a class=&#34;link&#34; href=&#34;https://github.com/vllm-project/vllm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;vLLM worker&lt;/a&gt; or &lt;a class=&#34;link&#34; href=&#34;https://github.com/sgl-project/sglang&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SGLang worker&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
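&lt;p&gt;For readers curious what the LoRA entries above boil down to, here is a minimal generic sketch using the Hugging Face &lt;code&gt;peft&lt;/code&gt; library directly. This is &lt;em&gt;not&lt;/em&gt; LLaMA-Factory&amp;rsquo;s zero-code CLI/YAML workflow, just the underlying adapter idea; the model name and hyperparameters are illustrative:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Generic LoRA sketch with Hugging Face peft (illustrative only;
# LLaMA-Factory drives this through its CLI/Web UI and YAML configs).
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

base = AutoModelForCausalLM.from_pretrained(&#34;meta-llama/Llama-3.2-1B&#34;)  # illustrative
lora = LoraConfig(
    r=8,                                   # adapter rank
    lora_alpha=16,                         # scaling factor
    target_modules=[&#34;q_proj&#34;, &#34;v_proj&#34;],   # attention projections
    lora_dropout=0.05,
    task_type=&#34;CAUSAL_LM&#34;,
)
model = get_peft_model(base, lora)
model.print_trainable_parameters()  # only a tiny fraction is trainable
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;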
&lt;h3 id=&#34;day-n-support-for-fine-tuning-cutting-edge-models&#34;&gt;Day-N Support for Fine-Tuning Cutting-Edge Models
&lt;/h3&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Support Date&lt;/th&gt;
          &lt;th&gt;Model Name&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;Day 0&lt;/td&gt;
          &lt;td&gt;Qwen3 / Qwen2.5-VL / Gemma 3 / InternLM 3 / MiniCPM-o-2.6&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Day 1&lt;/td&gt;
          &lt;td&gt;Llama 3 / GLM-4 / Mistral Small / PaliGemma2 / Llama 4&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;blogs&#34;&gt;Blogs
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://aws.amazon.com/cn/blogs/machine-learning/how-apoidea-group-enhances-visual-information-extraction-from-banking-documents-with-multimodal-models-using-llama-factory-on-amazon-sagemaker-hyperpod/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;How Apoidea Group enhances visual information extraction from banking documents with multimodal models using LLaMA-Factory on Amazon SageMaker HyperPod&lt;/a&gt; (English)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://buaa-act.feishu.cn/wiki/GVzlwYcRFiR8OLkHbL6cQpYin7g&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Easy Dataset × LLaMA Factory: Enabling LLMs to Efficiently Learn Domain Knowledge&lt;/a&gt; (English)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_deepseek_r1_distill_7b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA Factory: Fine-tuning the DeepSeek-R1-Distill-Qwen-7B Model for News Classifier&lt;/a&gt; (Chinese)&lt;/li&gt;
&lt;/ul&gt;
&lt;details&gt;&lt;summary&gt;All Blogs&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://aws.amazon.com/cn/blogs/china/a-one-stop-code-free-model-fine-tuning-deployment-platform-based-on-sagemaker-and-llama-factory/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;A One-Stop Code-Free Model Fine-Tuning &amp;amp; Deployment Platform based on SageMaker and LLaMA-Factory&lt;/a&gt; (Chinese)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory_qwen2vl&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA Factory Multi-Modal Fine-Tuning Practice: Fine-Tuning Qwen2-VL for Personal Tourist Guide&lt;/a&gt; (Chinese)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://gallery.pai-ml.com/#/preview/deepLearning/nlp/llama_factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA Factory: Fine-tuning the LLaMA3 Model for Role-Playing&lt;/a&gt; (Chinese)&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;h2 id=&#34;changelog&#34;&gt;Changelog
&lt;/h2&gt;&lt;p&gt;[25/04/28] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen3/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen3&lt;/a&gt;&lt;/strong&gt; model family.&lt;/p&gt;
&lt;p&gt;[25/04/21] We supported the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/KellerJordan/Muon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Muon&lt;/a&gt;&lt;/strong&gt; optimizer. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/tianshijing&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@tianshijing&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[25/04/16] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/OpenGVLab/InternVL3-8B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternVL3&lt;/a&gt;&lt;/strong&gt; model. See &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/pull/7258&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PR #7258&lt;/a&gt; to get started.&lt;/p&gt;
&lt;p&gt;[25/04/14] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/GLM-Z1-9B-0414&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-Z1&lt;/a&gt;&lt;/strong&gt; and &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Kimi-VL&lt;/a&gt;&lt;/strong&gt; models.&lt;/p&gt;
&lt;p&gt;[25/04/06] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://ai.meta.com/blog/llama-4-multimodal-intelligence/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 4&lt;/a&gt;&lt;/strong&gt; model. See &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/pull/7611&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PR #7611&lt;/a&gt; to get started.&lt;/p&gt;
&lt;details&gt;&lt;summary&gt;Full Changelog&lt;/summary&gt;
&lt;p&gt;[25/03/31] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen2.5-omni/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2.5 Omni&lt;/a&gt;&lt;/strong&gt; model. See &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/pull/7537&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PR #7537&lt;/a&gt; to get started.&lt;/p&gt;
&lt;p&gt;[25/03/15] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/sgl-project/sglang&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SGLang&lt;/a&gt;&lt;/strong&gt; as an inference backend. Try &lt;code&gt;infer_backend: sglang&lt;/code&gt; to accelerate inference.&lt;/p&gt;
&lt;p&gt;[25/03/12] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/blog/gemma3&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma 3&lt;/a&gt;&lt;/strong&gt; model.&lt;/p&gt;
&lt;p&gt;[25/02/24] Announcing &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/EasyR1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;EasyR1&lt;/a&gt;&lt;/strong&gt;, an efficient, scalable, multi-modality RL training framework for GRPO training.&lt;/p&gt;
&lt;p&gt;[25/02/11] We supported saving the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ollama/ollama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ollama&lt;/a&gt;&lt;/strong&gt; modelfile when exporting the model checkpoints. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[25/02/05] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34; &gt;Qwen2-Audio&lt;/a&gt;&lt;/strong&gt; and &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-o-2.6&lt;/a&gt;&lt;/strong&gt; on audio understanding tasks.&lt;/p&gt;
&lt;p&gt;[25/01/31] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/deepseek-ai/DeepSeek-R1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepSeek-R1&lt;/a&gt;&lt;/strong&gt; and &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2.5-VL&lt;/a&gt;&lt;/strong&gt; models.&lt;/p&gt;
&lt;p&gt;[25/01/15] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2412.05270&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;APOLLO&lt;/a&gt;&lt;/strong&gt; optimizer. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[25/01/14] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-o-2_6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-o-2.6&lt;/a&gt;&lt;/strong&gt; and &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb/MiniCPM-V-2_6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-V-2.6&lt;/a&gt;&lt;/strong&gt; models. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/BUAADreamer&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@BUAADreamer&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[25/01/14] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/internlm/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternLM 3&lt;/a&gt;&lt;/strong&gt; models. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/hhaAndroid&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@hhaAndroid&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[25/01/10] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft/phi-4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-4&lt;/a&gt;&lt;/strong&gt; model.&lt;/p&gt;
&lt;p&gt;[24/12/21] We supported using &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/SwanHubX/SwanLab&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SwanLab&lt;/a&gt;&lt;/strong&gt; for experiment tracking and visualization. See &lt;a class=&#34;link&#34; href=&#34;#use-swanlab-logger&#34; &gt;this section&lt;/a&gt; for details.&lt;/p&gt;
&lt;p&gt;[24/11/27] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Skywork-o1&lt;/a&gt;&lt;/strong&gt; model and the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenO1&lt;/a&gt;&lt;/strong&gt; dataset.&lt;/p&gt;
&lt;p&gt;[24/10/09] We supported downloading pre-trained models and datasets from the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://modelers.cn/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Modelers Hub&lt;/a&gt;&lt;/strong&gt;. See &lt;a class=&#34;link&#34; href=&#34;#download-from-modelers-hub&#34; &gt;this tutorial&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/09/19] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen2.5/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2.5&lt;/a&gt;&lt;/strong&gt; models.&lt;/p&gt;
&lt;p&gt;[24/08/30] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen2-vl/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2-VL&lt;/a&gt;&lt;/strong&gt; models. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/simonJJJ&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@simonJJJ&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[24/08/27] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/linkedin/Liger-Kernel&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Liger Kernel&lt;/a&gt;&lt;/strong&gt;. Try &lt;code&gt;enable_liger_kernel: true&lt;/code&gt; for efficient training.&lt;/p&gt;
&lt;p&gt;[24/08/09] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/zyushun/Adam-mini&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Adam-mini&lt;/a&gt;&lt;/strong&gt; optimizer. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/relic-yuexi&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@relic-yuexi&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[24/07/04] We supported &lt;a class=&#34;link&#34; href=&#34;https://github.com/MeetKai/functionary/tree/main/functionary/train/packing&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;contamination-free packed training&lt;/a&gt;. Use &lt;code&gt;neat_packing: true&lt;/code&gt; to activate it. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/chuan298&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@chuan298&lt;/a&gt;&amp;rsquo;s PR.&lt;/p&gt;
&lt;p&gt;[24/06/16] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.02948&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PiSSA&lt;/a&gt;&lt;/strong&gt; algorithm. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/06/07] We supported fine-tuning the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen2/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2&lt;/a&gt;&lt;/strong&gt; and &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/THUDM/GLM-4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-4&lt;/a&gt;&lt;/strong&gt; models.&lt;/p&gt;
&lt;p&gt;[24/05/26] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.14734&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SimPO&lt;/a&gt;&lt;/strong&gt; algorithm for preference learning. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/05/20] We supported fine-tuning the &lt;strong&gt;PaliGemma&lt;/strong&gt; series models. Note that the PaliGemma models are pre-trained models; you need to fine-tune them with the &lt;code&gt;paligemma&lt;/code&gt; template for chat completion.&lt;/p&gt;
&lt;p&gt;[24/05/18] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.01306&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;KTO&lt;/a&gt;&lt;/strong&gt; algorithm for preference learning. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/05/14] We supported training and inference on Ascend NPU devices. Check the &lt;a class=&#34;link&#34; href=&#34;#installation&#34; &gt;installation&lt;/a&gt; section for details.&lt;/p&gt;
&lt;p&gt;[24/04/26] We supported fine-tuning the &lt;strong&gt;LLaVA-1.5&lt;/strong&gt; multimodal LLMs. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/04/22] We provided a &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com/drive/1eRTPn37ltBbYsISy9Aw2NuI2Aq5CQrD9?usp=sharing&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Colab notebook&lt;/a&gt;&lt;/strong&gt; for fine-tuning the Llama-3 model on a free T4 GPU. Two Llama-3-derived models fine-tuned using LLaMA Factory are available on Hugging Face; check &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/shenzhi-wang/Llama3-8B-Chinese-Chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama3-8B-Chinese-Chat&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/zhichen/Llama3-Chinese&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama3-Chinese&lt;/a&gt; for details.&lt;/p&gt;
&lt;p&gt;[24/04/21] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.02258&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mixture-of-Depths&lt;/a&gt;&lt;/strong&gt; according to &lt;a class=&#34;link&#34; href=&#34;https://github.com/astramind-ai/Mixture-of-depths&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AstraMindAI&amp;rsquo;s implementation&lt;/a&gt;. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/04/16] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.02827&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BAdam&lt;/a&gt;&lt;/strong&gt; optimizer. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/04/16] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/unslothai/unsloth&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;unsloth&lt;/a&gt;&lt;/strong&gt;&amp;rsquo;s long-sequence training (Llama-2-7B-56k within 24GB). It achieves &lt;strong&gt;117%&lt;/strong&gt; of the speed and &lt;strong&gt;50%&lt;/strong&gt; of the memory usage compared with FlashAttention-2; more benchmarks can be found on &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this page&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;[24/03/31] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.07691&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ORPO&lt;/a&gt;&lt;/strong&gt;. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/03/21] Our paper &amp;ldquo;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.13372&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models&lt;/a&gt;&amp;rdquo; is available at arXiv!&lt;/p&gt;
&lt;p&gt;[24/03/20] We supported &lt;strong&gt;FSDP+QLoRA&lt;/strong&gt; that fine-tunes a 70B model on 2x24GB GPUs. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/03/13] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.12354&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LoRA+&lt;/a&gt;&lt;/strong&gt;. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/03/07] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.03507&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GaLore&lt;/a&gt;&lt;/strong&gt; optimizer. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/03/07] We integrated &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/vllm-project/vllm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;vLLM&lt;/a&gt;&lt;/strong&gt; for faster and concurrent inference. Try &lt;code&gt;infer_backend: vllm&lt;/code&gt; to enjoy &lt;strong&gt;270%&lt;/strong&gt; inference speed.&lt;/p&gt;
&lt;p&gt;[24/02/28] We supported weight-decomposed LoRA (&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.09353&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DoRA&lt;/a&gt;&lt;/strong&gt;). Try &lt;code&gt;use_dora: true&lt;/code&gt; to activate DoRA training.&lt;/p&gt;
&lt;p&gt;[24/02/15] We supported &lt;strong&gt;block expansion&lt;/strong&gt; proposed by &lt;a class=&#34;link&#34; href=&#34;https://github.com/TencentARC/LLaMA-Pro&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA Pro&lt;/a&gt;. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[24/02/05] Qwen1.5 (Qwen2 beta version) series models are supported in LLaMA-Factory. Check this &lt;a class=&#34;link&#34; href=&#34;https://qwenlm.github.io/blog/qwen1.5/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;blog post&lt;/a&gt; for details.&lt;/p&gt;
&lt;p&gt;[24/01/18] We supported &lt;strong&gt;agent tuning&lt;/strong&gt; for most models, equipping models with tool-using abilities by fine-tuning with &lt;code&gt;dataset: glaive_toolcall_en&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;[23/12/23] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/unslothai/unsloth&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;unsloth&lt;/a&gt;&lt;/strong&gt;&amp;rsquo;s implementation to boost LoRA tuning for the LLaMA, Mistral and Yi models. Try the &lt;code&gt;use_unsloth: true&lt;/code&gt; argument to activate the unsloth patch. It achieves &lt;strong&gt;170%&lt;/strong&gt; speed in our benchmark; check &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/wiki/Performance-comparison&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this page&lt;/a&gt; for details.&lt;/p&gt;
&lt;p&gt;[23/12/12] We supported fine-tuning the latest MoE model &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai/Mixtral-8x7B-v0.1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mixtral 8x7B&lt;/a&gt;&lt;/strong&gt; in our framework. See hardware requirement &lt;a class=&#34;link&#34; href=&#34;#hardware-requirement&#34; &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;[23/12/01] We supported downloading pre-trained models and datasets from the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModelScope Hub&lt;/a&gt;&lt;/strong&gt;. See &lt;a class=&#34;link&#34; href=&#34;#download-from-modelscope-hub&#34; &gt;this tutorial&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[23/10/21] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2310.05914&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NEFTune&lt;/a&gt;&lt;/strong&gt; trick for fine-tuning. Try the &lt;code&gt;neftune_noise_alpha: 5&lt;/code&gt; argument to activate NEFTune.&lt;/p&gt;
&lt;p&gt;[23/09/27] We supported &lt;strong&gt;$S^2$-Attn&lt;/strong&gt; proposed by &lt;a class=&#34;link&#34; href=&#34;https://github.com/dvlab-research/LongLoRA&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LongLoRA&lt;/a&gt; for the LLaMA models. Try the &lt;code&gt;shift_attn: true&lt;/code&gt; argument to enable shift short attention.&lt;/p&gt;
&lt;p&gt;[23/09/23] We integrated MMLU, C-Eval and CMMLU benchmarks in this repo. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[23/09/10] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Dao-AILab/flash-attention&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FlashAttention-2&lt;/a&gt;&lt;/strong&gt;. Try the &lt;code&gt;flash_attn: fa2&lt;/code&gt; argument to enable FlashAttention-2 if you are using RTX 4090, A100, or H100 GPUs.&lt;/p&gt;
&lt;p&gt;[23/08/12] We supported &lt;strong&gt;RoPE scaling&lt;/strong&gt; to extend the context length of the LLaMA models. Try the &lt;code&gt;rope_scaling: linear&lt;/code&gt; argument in training and the &lt;code&gt;rope_scaling: dynamic&lt;/code&gt; argument at inference to extrapolate the position embeddings.&lt;/p&gt;
&lt;p&gt;[23/08/11] We supported &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2305.18290&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DPO training&lt;/a&gt;&lt;/strong&gt; for instruction-tuned models. See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;p&gt;[23/07/31] We supported &lt;strong&gt;dataset streaming&lt;/strong&gt;. Try the &lt;code&gt;streaming: true&lt;/code&gt; and &lt;code&gt;max_steps: 10000&lt;/code&gt; arguments to load your dataset in streaming mode.&lt;/p&gt;
&lt;p&gt;[23/07/29] We released two instruction-tuned 13B models on Hugging Face. See these Hugging Face repos (&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/hiyouga/Llama-2-Chinese-13b-chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA-2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/hiyouga/Baichuan-13B-sft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Baichuan&lt;/a&gt;) for details.&lt;/p&gt;
&lt;p&gt;[23/07/18] We developed an &lt;strong&gt;all-in-one Web UI&lt;/strong&gt; for training, evaluation and inference. Try &lt;code&gt;train_web.py&lt;/code&gt; to fine-tune models in your web browser. Thanks to &lt;a class=&#34;link&#34; href=&#34;https://github.com/KanadeSiina&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@KanadeSiina&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/codemayq&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;@codemayq&lt;/a&gt; for their efforts in the development.&lt;/p&gt;
&lt;p&gt;[23/07/09] We released &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/FastEdit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FastEdit&lt;/a&gt;&lt;/strong&gt; ⚡🩹, an easy-to-use package for editing the factual knowledge of large language models efficiently. Please follow &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/FastEdit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FastEdit&lt;/a&gt; if you are interested.&lt;/p&gt;
&lt;p&gt;[23/06/29] We provided a &lt;strong&gt;reproducible example&lt;/strong&gt; of training a chat model using instruction-following datasets, see &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/hiyouga/Baichuan-7B-sft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Baichuan-7B-sft&lt;/a&gt; for details.&lt;/p&gt;
&lt;p&gt;[23/06/22] We aligned the &lt;a class=&#34;link&#34; href=&#34;src/api_demo.py&#34; &gt;demo API&lt;/a&gt; with &lt;a class=&#34;link&#34; href=&#34;https://platform.openai.com/docs/api-reference/chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenAI&amp;rsquo;s&lt;/a&gt; format, so you can plug the fine-tuned model into &lt;strong&gt;arbitrary ChatGPT-based applications&lt;/strong&gt;.&lt;/p&gt;
&lt;p&gt;[23/06/03] We supported quantized training and inference (aka &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/artidoro/qlora&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;QLoRA&lt;/a&gt;&lt;/strong&gt;). See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples&lt;/a&gt; for usage.&lt;/p&gt;
&lt;/details&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
If you cannot use the latest feature, please pull the latest code and install LLaMA-Factory again.&lt;/p&gt;
&lt;/blockquote&gt;
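&lt;p&gt;For example, from an existing checkout (a minimal sketch; adjust the extras to your setup):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git pull origin main
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -e &amp;#34;.[torch,metrics]&amp;#34; --no-build-isolation
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;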
&lt;h2 id=&#34;supported-models&#34;&gt;Supported Models
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Model&lt;/th&gt;
          &lt;th&gt;Model size&lt;/th&gt;
          &lt;th&gt;Template&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/baichuan-inc&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Baichuan 2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/13B&lt;/td&gt;
          &lt;td&gt;baichuan2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/bigscience&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BLOOM/BLOOMZ&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;560M/1.1B/1.7B/3B/7.1B/176B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ChatGLM3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;6B&lt;/td&gt;
          &lt;td&gt;chatglm3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/CohereForAI&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Command R&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;35B/104B&lt;/td&gt;
          &lt;td&gt;cohere&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/deepseek-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepSeek (Code/MoE)&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/16B/67B/236B&lt;/td&gt;
          &lt;td&gt;deepseek&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/deepseek-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepSeek 2.5/3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;236B/671B&lt;/td&gt;
          &lt;td&gt;deepseek3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/deepseek-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepSeek R1 (Distill)&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1.5B/7B/8B/14B/32B/70B/671B&lt;/td&gt;
          &lt;td&gt;deepseekr1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/tiiuae&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Falcon&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/11B/40B/180B&lt;/td&gt;
          &lt;td&gt;falcon&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma/Gemma 2/CodeGemma&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;2B/7B/9B/27B&lt;/td&gt;
          &lt;td&gt;gemma&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma 3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/4B/12B/27B&lt;/td&gt;
          &lt;td&gt;gemma3/gemma (1B)&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-4/GLM-4-0414/GLM-Z1&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;9B/32B&lt;/td&gt;
          &lt;td&gt;glm4/glmz1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openai-community&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GPT-2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;0.1B/0.4B/0.8B/1.5B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/ibm-granite&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Granite 3.0-3.3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/2B/3B/8B&lt;/td&gt;
          &lt;td&gt;granite3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/tencent/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hunyuan&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B&lt;/td&gt;
          &lt;td&gt;hunyuan&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/IndexTeam&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Index&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1.9B&lt;/td&gt;
          &lt;td&gt;index&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/internlm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternLM 2-3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/8B/20B&lt;/td&gt;
          &lt;td&gt;intern2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/OpenGVLab&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternVL 2.5-3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/2B/8B/14B/38B/78B&lt;/td&gt;
          &lt;td&gt;intern_vl&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/moonshotai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Kimi-VL&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;16B&lt;/td&gt;
          &lt;td&gt;kimi_vl&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/facebookresearch/llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/13B/33B/65B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/meta-llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/13B/70B&lt;/td&gt;
          &lt;td&gt;llama2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/meta-llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 3-3.3&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/3B/8B/70B&lt;/td&gt;
          &lt;td&gt;llama3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/meta-llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 4&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;109B/402B&lt;/td&gt;
          &lt;td&gt;llama4&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/meta-llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 3.2 Vision&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;11B/90B&lt;/td&gt;
          &lt;td&gt;mllama&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/llava-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA-1.5&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/13B&lt;/td&gt;
          &lt;td&gt;llava&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/llava-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA-NeXT&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/8B/13B/34B/72B/110B&lt;/td&gt;
          &lt;td&gt;llava_next&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/llava-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA-NeXT-Video&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/34B&lt;/td&gt;
          &lt;td&gt;llava_next_video&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/XiaomiMiMo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiMo&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B&lt;/td&gt;
          &lt;td&gt;mimo&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/2B/4B&lt;/td&gt;
          &lt;td&gt;cpm/cpm3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/openbmb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM-o-2.6/MiniCPM-V-2.6&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;8B&lt;/td&gt;
          &lt;td&gt;minicpm_o/minicpm_v&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ministral/Mistral-Nemo&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;8B/12B&lt;/td&gt;
          &lt;td&gt;ministral&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mistral/Mixtral&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/8x7B/8x22B&lt;/td&gt;
          &lt;td&gt;mistral&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mistral Small&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;24B&lt;/td&gt;
          &lt;td&gt;mistral_small&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/allenai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OLMo&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1B/7B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/google&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PaliGemma/PaliGemma2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;3B/10B/28B&lt;/td&gt;
          &lt;td&gt;paligemma&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-1.5/Phi-2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1.3B/2.7B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-3/Phi-3.5&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;4B/14B&lt;/td&gt;
          &lt;td&gt;phi&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-3-small&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B&lt;/td&gt;
          &lt;td&gt;phi_small&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-4&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;14B&lt;/td&gt;
          &lt;td&gt;phi4&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Pixtral&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;12B&lt;/td&gt;
          &lt;td&gt;pixtral&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen (1-2.5) (Code/Math/MoE/QwQ)&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;0.5B/1.5B/3B/7B/14B/32B/72B/110B&lt;/td&gt;
          &lt;td&gt;qwen&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen3 (MoE)&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;0.6B/1.7B/4B/8B/14B/32B/235B&lt;/td&gt;
          &lt;td&gt;qwen3&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2-Audio&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B&lt;/td&gt;
          &lt;td&gt;qwen2_audio&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2.5-Omni&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;3B/7B&lt;/td&gt;
          &lt;td&gt;qwen2_omni&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2-VL/Qwen2.5-VL/QVQ&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;2B/3B/7B/32B/72B&lt;/td&gt;
          &lt;td&gt;qwen2_vl&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/ByteDance-Seed&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Seed Coder&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;8B&lt;/td&gt;
          &lt;td&gt;seed_coder&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Skywork&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Skywork o1&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;8B&lt;/td&gt;
          &lt;td&gt;skywork_o1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/bigcode&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StarCoder 2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;3B/7B/15B&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Tele-AI&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TeleChat2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;3B/7B/35B/115B&lt;/td&gt;
          &lt;td&gt;telechat2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/xverse&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;XVERSE&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;7B/13B/65B&lt;/td&gt;
          &lt;td&gt;xverse&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/01-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yi/Yi-1.5 (Code)&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;1.5B/6B/9B/34B&lt;/td&gt;
          &lt;td&gt;yi&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/01-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yi-VL&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;6B/34B&lt;/td&gt;
          &lt;td&gt;yi_vl&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/IEITYuan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yuan 2&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;2B/51B/102B&lt;/td&gt;
          &lt;td&gt;yuan&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;blockquote&gt;
&lt;p&gt;[!NOTE]
For the &amp;ldquo;base&amp;rdquo; models, the &lt;code&gt;template&lt;/code&gt; argument can be chosen from &lt;code&gt;default&lt;/code&gt;, &lt;code&gt;alpaca&lt;/code&gt;, &lt;code&gt;vicuna&lt;/code&gt; etc. But make sure to use the &lt;strong&gt;corresponding template&lt;/strong&gt; for the &amp;ldquo;instruct/chat&amp;rdquo; models.&lt;/p&gt;
&lt;p&gt;Remember to use the &lt;strong&gt;SAME&lt;/strong&gt; template in training and inference.&lt;/p&gt;
&lt;p&gt;*: You should install the &lt;code&gt;transformers&lt;/code&gt; from main branch and use &lt;code&gt;DISABLE_VERSION_CHECK=1&lt;/code&gt; to skip version check.&lt;/p&gt;
&lt;p&gt;**: You need to install a specific version of &lt;code&gt;transformers&lt;/code&gt; to use the corresponding model.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;Please refer to &lt;a class=&#34;link&#34; href=&#34;src/llamafactory/extras/constants.py&#34; &gt;constants.py&lt;/a&gt; for a full list of the models we support.&lt;/p&gt;
&lt;p&gt;You can also add a custom chat template to &lt;a class=&#34;link&#34; href=&#34;src/llamafactory/data/template.py&#34; &gt;template.py&lt;/a&gt;.&lt;/p&gt;
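&lt;p&gt;As an illustration of the template rule above, a minimal sketch (the two YAML paths mirror the repository&amp;rsquo;s examples layout and are assumptions here):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;# both example configs set template: llama3, keeping training and inference consistent
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;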
&lt;h2 id=&#34;supported-training-approaches&#34;&gt;Supported Training Approaches
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Approach&lt;/th&gt;
          &lt;th&gt;Full-tuning&lt;/th&gt;
          &lt;th&gt;Freeze-tuning&lt;/th&gt;
          &lt;th&gt;LoRA&lt;/th&gt;
          &lt;th&gt;QLoRA&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;Pre-Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Supervised Fine-Tuning&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Reward Modeling&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;PPO Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;DPO Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;KTO Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;ORPO Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;SimPO Training&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
          &lt;td&gt;:white_check_mark:&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
The implementation details of PPO can be found in &lt;a class=&#34;link&#34; href=&#34;https://newfacade.github.io/notes-on-reinforcement-learning/17-ppo-trl.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this blog&lt;/a&gt;.&lt;/p&gt;
&lt;/blockquote&gt;
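&lt;p&gt;To make the table concrete, the sketch below pairs one approach (DPO) with one tuning method (LoRA). The field names mirror the repository&amp;rsquo;s example configs, and the dataset name and hyperparameters are placeholders rather than a tuned recipe:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;# hypothetical minimal recipe: DPO training with LoRA adapters
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;cat &amp;gt; dpo_lora_demo.yaml &amp;lt;&amp;lt;&amp;#39;EOF&amp;#39;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;stage: dpo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;do_train: true
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;finetuning_type: lora
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;lora_target: all
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;dataset: dpo_en_demo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;template: llama3
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;output_dir: saves/llama3-8b-lora-dpo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;per_device_train_batch_size: 1
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;learning_rate: 5.0e-6
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;num_train_epochs: 1.0
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;EOF
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli train dpo_lora_demo.yaml
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;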
&lt;h2 id=&#34;provided-datasets&#34;&gt;Provided Datasets
&lt;/h2&gt;&lt;details&gt;&lt;summary&gt;Pre-training datasets&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;data/wiki_demo.txt&#34; &gt;Wiki Demo (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/tiiuae/falcon-refinedweb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RefinedWeb (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/togethercomputer/RedPajama-Data-V2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RedPajama V2 (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/olm/olm-wikipedia-20221220&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Wikipedia (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/pleisto/wikipedia-cn-20230720-filtered&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Wikipedia (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/EleutherAI/pile&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Pile (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Skywork/SkyPile-150B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SkyPile (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HuggingFaceFW/fineweb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FineWeb (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HuggingFaceFW/fineweb-edu&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FineWeb-Edu (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/bigcode/the-stack&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;The Stack (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/bigcode/starcoderdata&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StarCoder (en)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;&lt;summary&gt;Supervised fine-tuning datasets&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;data/identity.json&#34; &gt;Identity (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/tatsu-lab/stanford_alpaca&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Stanford Alpaca (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ymcui/Chinese-LLaMA-Alpaca-3&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Stanford Alpaca (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Alpaca GPT4 (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/glaiveai/glaive-function-calling-v2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Glaive Function Calling V2 (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/GAIR/lima&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LIMA (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/JosephusCheung/GuanacoDataset&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Guanaco Dataset (multilingual)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/train_2M_CN&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE 2M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/train_1M_CN&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE 1M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/train_0.5M_CN&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE 0.5M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE Dialogue 0.4M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/school_math_0.25M&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE School Math 0.25M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BelleGroup/multiturn_chat_0.8M&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BELLE Multiturn Chat 0.8M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/thunlp/UltraChat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;UltraChat (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/garage-bAInd/Open-Platypus&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenPlatypus (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/sahil2801/CodeAlpaca-20k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;CodeAlpaca 20k (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/QingyiSi/Alpaca-CoT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Alpaca CoT (multilingual)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Open-Orca/OpenOrca&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenOrca (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Open-Orca/SlimOrca&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SlimOrca (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/TIGER-Lab/MathInstruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MathInstruct (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/YeungNLP/firefly-train-1.1M&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Firefly 1.1M (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/wiki_qa&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Wiki QA (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/suolyer/webqa&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Web QA (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/zxbsmk/webnovel_cn&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;WebNovel (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/berkeley-nest/Nectar&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Nectar (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/datasets/deepctrl/deepctrl-sft-data&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;deepctrl (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HasturOfficial/adgen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Advertise Generating (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/totally-not-an-llm/sharegpt-hyperfiltered-3k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ShareGPT Hyperfiltered (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/shibing624/sharegpt_gpt4&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ShareGPT4 (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HuggingFaceH4/ultrachat_200k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;UltraChat 200k (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/THUDM/AgentInstruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AgentInstruct (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/lmsys/lmsys-chat-1m&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LMSYS Chat 1M (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/WizardLM/WizardLM_evol_instruct_V2_196k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Evol Instruct V2 (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HuggingFaceTB/cosmopedia&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Cosmopedia (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/hfl/stem_zh_instruction&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;STEM (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/hfl/ruozhiba_gpt4_turbo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ruozhiba (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/m-a-p/neo_sft_phase2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Neo-sft (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Magpie-Align/Magpie-Pro-300K-Filtered&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Magpie-Pro-300K-Filtered (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/argilla/magpie-ultra-v0.1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Magpie-ultra-v0.1 (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/TIGER-Lab/WebInstructSub&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;WebInstructSub (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/O1-OPEN/OpenO1-SFT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenO1-SFT (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Open-Thoughts (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/open-r1/OpenR1-Math-220k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Open-R1-Math (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Chinese-DeepSeek-R1-Distill (zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/BUAADreamer/llava-en-zh-300k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA mixed (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/jugg1024/pokemon-gpt4o-captions&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Pokemon-gpt4o-captions (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/oasst_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Open Assistant (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/dolly-15k_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Dolly 15k (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/alpaca-gpt4_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Alpaca GPT4 (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/openschnabeltier_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenSchnabeltier (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/evol-instruct_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Evol Instruct (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/dolphin_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Dolphin (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/booksum_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Booksum (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/airoboros-3.0_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Airoboros (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/ultra-chat_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ultrachat (de)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;&lt;summary&gt;Preference datasets&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/hiyouga/DPO-En-Zh-20k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DPO mixed (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/HuggingFaceH4/ultrafeedback_binarized&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;UltraFeedback (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/m-a-p/COIG-P&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;COIG-P (en&amp;amp;zh)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/openbmb/RLHF-V-Dataset&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLHF-V (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Zhihui/VLFeedback&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;VLFeedback (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RLAIF-V (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Intel/orca_dpo_pairs&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Orca DPO Pairs (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/Anthropic/hh-rlhf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;HH-RLHF (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/berkeley-nest/Nectar&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Nectar (en)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/mayflowergmbh/intel_orca_dpo_pairs_de&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Orca DPO (de)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/datasets/argilla/kto-mix-15k&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;KTO mixed (en)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
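&lt;p&gt;Each entry above is registered under a short dataset name, and training configs reference those names rather than URLs. A minimal sketch, assuming the identifiers below match the registry (&lt;code&gt;glaive_toolcall_en&lt;/code&gt; appears in the changelog above; &lt;code&gt;alpaca_gpt4_en&lt;/code&gt; is assumed):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;# append a dataset selection to a hypothetical SFT config; names are comma-separated
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;cat &amp;gt;&amp;gt; my_sft.yaml &amp;lt;&amp;lt;&amp;#39;EOF&amp;#39;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;dataset: alpaca_gpt4_en,glaive_toolcall_en
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;dataset_dir: data
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;EOF
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;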
&lt;p&gt;Some datasets require confirmation before they can be used, so we recommend logging in to your Hugging Face account with the following commands:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install --upgrade huggingface_hub
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;huggingface-cli login
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;requirement&#34;&gt;Requirement
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Mandatory&lt;/th&gt;
          &lt;th&gt;Minimum&lt;/th&gt;
          &lt;th&gt;Recommend&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;python&lt;/td&gt;
          &lt;td&gt;3.9&lt;/td&gt;
          &lt;td&gt;3.10&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;torch&lt;/td&gt;
          &lt;td&gt;2.0.0&lt;/td&gt;
          &lt;td&gt;2.6.0&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;torchvision&lt;/td&gt;
          &lt;td&gt;0.15.0&lt;/td&gt;
          &lt;td&gt;0.21.0&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;transformers&lt;/td&gt;
          &lt;td&gt;4.45.0&lt;/td&gt;
          &lt;td&gt;4.50.0&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;datasets&lt;/td&gt;
          &lt;td&gt;2.16.0&lt;/td&gt;
          &lt;td&gt;3.2.0&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;accelerate&lt;/td&gt;
          &lt;td&gt;0.34.0&lt;/td&gt;
          &lt;td&gt;1.2.1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;peft&lt;/td&gt;
          &lt;td&gt;0.14.0&lt;/td&gt;
          &lt;td&gt;0.15.1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;trl&lt;/td&gt;
          &lt;td&gt;0.8.6&lt;/td&gt;
          &lt;td&gt;0.9.6&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Optional&lt;/th&gt;
          &lt;th&gt;Minimum&lt;/th&gt;
          &lt;th&gt;Recommend&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;CUDA&lt;/td&gt;
          &lt;td&gt;11.6&lt;/td&gt;
          &lt;td&gt;12.2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;deepspeed&lt;/td&gt;
          &lt;td&gt;0.10.0&lt;/td&gt;
          &lt;td&gt;0.16.4&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;bitsandbytes&lt;/td&gt;
          &lt;td&gt;0.39.0&lt;/td&gt;
          &lt;td&gt;0.43.1&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;vllm&lt;/td&gt;
          &lt;td&gt;0.4.3&lt;/td&gt;
          &lt;td&gt;0.8.2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;flash-attn&lt;/td&gt;
          &lt;td&gt;2.5.6&lt;/td&gt;
          &lt;td&gt;2.7.2&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h3 id=&#34;hardware-requirement&#34;&gt;Hardware Requirement
&lt;/h3&gt;&lt;p&gt;* &lt;em&gt;estimated&lt;/em&gt;&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Method&lt;/th&gt;
          &lt;th&gt;Bits&lt;/th&gt;
          &lt;th&gt;7B&lt;/th&gt;
          &lt;th&gt;14B&lt;/th&gt;
          &lt;th&gt;30B&lt;/th&gt;
          &lt;th&gt;70B&lt;/th&gt;
          &lt;th&gt;&lt;code&gt;x&lt;/code&gt;B&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;Full (&lt;code&gt;bf16&lt;/code&gt; or &lt;code&gt;fp16&lt;/code&gt;)&lt;/td&gt;
          &lt;td&gt;32&lt;/td&gt;
          &lt;td&gt;120GB&lt;/td&gt;
          &lt;td&gt;240GB&lt;/td&gt;
          &lt;td&gt;600GB&lt;/td&gt;
          &lt;td&gt;1200GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;18x&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Full (&lt;code&gt;pure_bf16&lt;/code&gt;)&lt;/td&gt;
          &lt;td&gt;16&lt;/td&gt;
          &lt;td&gt;60GB&lt;/td&gt;
          &lt;td&gt;120GB&lt;/td&gt;
          &lt;td&gt;300GB&lt;/td&gt;
          &lt;td&gt;600GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;8x&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Freeze/LoRA/GaLore/APOLLO/BAdam&lt;/td&gt;
          &lt;td&gt;16&lt;/td&gt;
          &lt;td&gt;16GB&lt;/td&gt;
          &lt;td&gt;32GB&lt;/td&gt;
          &lt;td&gt;64GB&lt;/td&gt;
          &lt;td&gt;160GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;2x&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;QLoRA&lt;/td&gt;
          &lt;td&gt;8&lt;/td&gt;
          &lt;td&gt;10GB&lt;/td&gt;
          &lt;td&gt;20GB&lt;/td&gt;
          &lt;td&gt;40GB&lt;/td&gt;
          &lt;td&gt;80GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;x&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;QLoRA&lt;/td&gt;
          &lt;td&gt;4&lt;/td&gt;
          &lt;td&gt;6GB&lt;/td&gt;
          &lt;td&gt;12GB&lt;/td&gt;
          &lt;td&gt;24GB&lt;/td&gt;
          &lt;td&gt;48GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;x/2&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;QLoRA&lt;/td&gt;
          &lt;td&gt;2&lt;/td&gt;
          &lt;td&gt;4GB&lt;/td&gt;
          &lt;td&gt;8GB&lt;/td&gt;
          &lt;td&gt;16GB&lt;/td&gt;
          &lt;td&gt;24GB&lt;/td&gt;
          &lt;td&gt;&lt;code&gt;x/4&lt;/code&gt;GB&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
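&lt;p&gt;The last column gives a rough scaling rule in the model size &lt;code&gt;x&lt;/code&gt; (in billions of parameters): for example, full fine-tuning in &lt;code&gt;bf16&lt;/code&gt; or &lt;code&gt;fp16&lt;/code&gt; needs about &lt;code&gt;18x&lt;/code&gt;GB, i.e. 18 × 7 ≈ 126GB for a 7B model, in line with the 120GB estimate above, while Freeze/LoRA-style tuning needs about &lt;code&gt;2x&lt;/code&gt;GB, roughly 14GB for a 7B model, consistent with the 16GB estimate.&lt;/p&gt;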
&lt;h2 id=&#34;getting-started&#34;&gt;Getting Started
&lt;/h2&gt;&lt;h3 id=&#34;installation&#34;&gt;Installation
&lt;/h3&gt;&lt;blockquote&gt;
&lt;p&gt;[!IMPORTANT]
Installation is mandatory.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone --depth &lt;span class=&#34;m&#34;&gt;1&lt;/span&gt; https://github.com/hiyouga/LLaMA-Factory.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; LLaMA-Factory
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -e &lt;span class=&#34;s2&#34;&gt;&amp;#34;.[torch,metrics]&amp;#34;&lt;/span&gt; --no-build-isolation
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Extra dependencies available: torch, torch-npu, metrics, deepspeed, liger-kernel, bitsandbytes, hqq, eetq, gptq, aqlm, vllm, sglang, galore, apollo, badam, adam-mini, qwen, minicpm_v, modelscope, openmind, swanlab, quality&lt;/p&gt;
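&lt;p&gt;For example, to pull in several of these extras in one install (a sketch; choose the extras that match your hardware and backends):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -e &amp;#34;.[torch,metrics,deepspeed,vllm]&amp;#34; --no-build-isolation
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;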
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
Use &lt;code&gt;pip install -e . --no-deps --no-build-isolation&lt;/code&gt; to resolve package conflicts.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;details&gt;&lt;summary&gt;Setting up a virtual environment with &lt;b&gt;uv&lt;/b&gt;&lt;/summary&gt;
&lt;p&gt;Create an isolated Python environment with &lt;a class=&#34;link&#34; href=&#34;https://github.com/astral-sh/uv&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;uv&lt;/a&gt;:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv sync --extra torch --extra metrics --prerelease&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;allow
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Run LLaMA-Factory in the isolated environment:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;uv run --prerelease&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;allow llamafactory-cli train examples/train_lora/llama3_lora_pretrain.yaml
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;details&gt;&lt;summary&gt;For Windows users&lt;/summary&gt;
&lt;h4 id=&#34;install-pytorch&#34;&gt;Install PyTorch
&lt;/h4&gt;&lt;p&gt;You need to manually install the GPU version of PyTorch on the Windows platform. Please refer to the &lt;a class=&#34;link&#34; href=&#34;https://pytorch.org/get-started/locally/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;official website&lt;/a&gt; and use the following commands to install PyTorch with CUDA support:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip uninstall torch torchvision torchaudio
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;python -c &lt;span class=&#34;s2&#34;&gt;&amp;#34;import torch; print(torch.cuda.is_available())&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;If you see &lt;code&gt;True&lt;/code&gt;, you have successfully installed PyTorch with CUDA support.&lt;/p&gt;
&lt;p&gt;Try &lt;code&gt;dataloader_num_workers: 0&lt;/code&gt; if you encounter a &lt;code&gt;Can&#39;t pickle local object&lt;/code&gt; error.&lt;/p&gt;
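&lt;p&gt;For example, assuming the &lt;code&gt;key=value&lt;/code&gt; command-line override form used elsewhere in this README, the worker count can be set without editing the YAML file (a minimal sketch):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# disable multiprocess data loading to avoid the pickling error on Windows
llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml dataloader_num_workers=0
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;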
&lt;h4 id=&#34;install-bitsandbytes&#34;&gt;Install BitsAndBytes
&lt;/h4&gt;&lt;p&gt;If you want to enable quantized LoRA (QLoRA) on the Windows platform, you need to install a pre-built version of the &lt;code&gt;bitsandbytes&lt;/code&gt; library, which supports CUDA 11.1 to 12.2. Please select the appropriate &lt;a class=&#34;link&#34; href=&#34;https://github.com/jllllll/bitsandbytes-windows-webui/releases/tag/wheels&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;release version&lt;/a&gt; based on your CUDA version.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install https://github.com/jllllll/bitsandbytes-windows-webui/releases/download/wheels/bitsandbytes-0.41.2.post2-py3-none-win_amd64.whl
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h4 id=&#34;install-flash-attention-2&#34;&gt;Install Flash Attention-2
&lt;/h4&gt;&lt;p&gt;To enable FlashAttention-2 on the Windows platform, please use the script from &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/lldacing/flash-attention-windows-wheel&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;flash-attention-windows-wheel&lt;/a&gt; to compile and install it yourself.&lt;/p&gt;
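&lt;p&gt;After compilation, a quick sanity check (a minimal sketch) confirms the wheel is importable:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# should print the installed flash-attn version without raising an ImportError
python -c &amp;#34;import flash_attn; print(flash_attn.__version__)&amp;#34;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;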
&lt;/details&gt;
&lt;details&gt;&lt;summary&gt;For Ascend NPU users&lt;/summary&gt;
&lt;p&gt;To install LLaMA Factory on Ascend NPU devices, please upgrade Python to version 3.10 or higher and specify extra dependencies: &lt;code&gt;pip install -e &amp;quot;.[torch-npu,metrics]&amp;quot;&lt;/code&gt;. Additionally, you need to install the &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://www.hiascend.com/developer/download/community/result?module=cann&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ascend CANN Toolkit and Kernels&lt;/a&gt;&lt;/strong&gt;. Please follow the &lt;a class=&#34;link&#34; href=&#34;https://www.hiascend.com/document/detail/en/CANNCommunityEdition/600alphaX/softwareinstall/instg/atlasdeploy_03_0031.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;installation tutorial&lt;/a&gt; or use the following commands:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# replace the url according to your CANN version and devices&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# install CANN Toolkit&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-toolkit_8.0.0.alpha002_linux-&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;$(&lt;/span&gt;uname -i&lt;span class=&#34;k&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;.run
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;bash Ascend-cann-toolkit_8.0.0.alpha002_linux-&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;$(&lt;/span&gt;uname -i&lt;span class=&#34;k&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;.run --install
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# install CANN Kernels&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;wget https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C20SPC702/Ascend-cann-kernels-910b_8.0.0.alpha002_linux-&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;$(&lt;/span&gt;uname -i&lt;span class=&#34;k&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;.run
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;bash Ascend-cann-kernels-910b_8.0.0.alpha002_linux-&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;$(&lt;/span&gt;uname -i&lt;span class=&#34;k&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;.run --install
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# set env variables&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;source&lt;/span&gt; /usr/local/Ascend/ascend-toolkit/set_env.sh
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Requirement&lt;/th&gt;
          &lt;th&gt;Minimum&lt;/th&gt;
          &lt;th&gt;Recommend&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;CANN&lt;/td&gt;
          &lt;td&gt;8.0.RC1&lt;/td&gt;
          &lt;td&gt;8.0.0.alpha002&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;torch&lt;/td&gt;
          &lt;td&gt;2.1.0&lt;/td&gt;
          &lt;td&gt;2.4.0&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;torch-npu&lt;/td&gt;
          &lt;td&gt;2.1.0&lt;/td&gt;
          &lt;td&gt;2.4.0.post2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;deepspeed&lt;/td&gt;
          &lt;td&gt;0.13.2&lt;/td&gt;
          &lt;td&gt;0.13.2&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;vllm-ascend&lt;/td&gt;
          &lt;td&gt;-&lt;/td&gt;
          &lt;td&gt;0.7.3&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;Remember to use &lt;code&gt;ASCEND_RT_VISIBLE_DEVICES&lt;/code&gt; instead of &lt;code&gt;CUDA_VISIBLE_DEVICES&lt;/code&gt; to specify the device to use.&lt;/p&gt;
&lt;p&gt;If you cannot run model inference on NPU devices, try setting &lt;code&gt;do_sample: false&lt;/code&gt; in the configuration.&lt;/p&gt;
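&lt;p&gt;For example, combining both tips above (a minimal sketch; &lt;code&gt;do_sample&lt;/code&gt; is passed here as a &lt;code&gt;key=value&lt;/code&gt; command-line override):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# select NPU 0 and disable sampling during inference
ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli chat examples/inference/llama3_lora_sft.yaml do_sample=false
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;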
&lt;p&gt;Download the pre-built Docker images: &lt;a class=&#34;link&#34; href=&#34;http://mirrors.cn-central-221.ovaijisuan.com/detail/130.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;32GB&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;http://mirrors.cn-central-221.ovaijisuan.com/detail/131.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;64GB&lt;/a&gt;&lt;/p&gt;
&lt;h4 id=&#34;install-bitsandbytes-1&#34;&gt;Install BitsAndBytes
&lt;/h4&gt;&lt;p&gt;To use QLoRA based on bitsandbytes on Ascend NPU, please follow these 3 steps:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Manually compile bitsandbytes: Refer to &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/docs/bitsandbytes/installation?backend=Ascend&amp;#43;NPU&amp;amp;platform=Ascend&amp;#43;NPU&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;the installation documentation&lt;/a&gt; for the NPU version of bitsandbytes to complete the compilation and installation. The compilation requires a cmake version of at least 3.22.1 and a g++ version of at least 12.x.&lt;/li&gt;
&lt;/ol&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Install bitsandbytes from source&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Clone bitsandbytes repo, Ascend NPU backend is currently enabled on multi-backend-refactor branch&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone -b multi-backend-refactor https://github.com/bitsandbytes-foundation/bitsandbytes.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; bitsandbytes/
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Install dependencies&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install -r requirements-dev.txt
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Install the dependencies for the compilation tools. Note that the commands for this step may vary depending on the operating system. The following are provided for reference&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;apt-get install -y build-essential cmake
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Compile &amp;amp; install  &lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;cmake -DCOMPUTE_BACKEND&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;npu -S .
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;make
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install .
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ol start=&#34;2&#34;&gt;
&lt;li&gt;Install transformers from the main branch.&lt;/li&gt;
&lt;/ol&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone -b main https://github.com/huggingface/transformers.git
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; transformers
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install .
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ol start=&#34;3&#34;&gt;
&lt;li&gt;Set &lt;code&gt;double_quantization: false&lt;/code&gt; in the configuration. You can refer to the &lt;a class=&#34;link&#34; href=&#34;examples/train_qlora/llama3_lora_sft_bnb_npu.yaml&#34; &gt;example&lt;/a&gt;.&lt;/li&gt;
&lt;/ol&gt;
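&lt;p&gt;With the three steps completed, the referenced example can be launched directly (a sketch, assuming a single NPU):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# run QLoRA fine-tuning on NPU 0 with double quantization disabled in the config
ASCEND_RT_VISIBLE_DEVICES=0 llamafactory-cli train examples/train_qlora/llama3_lora_sft_bnb_npu.yaml
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;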
&lt;/details&gt;
&lt;h3 id=&#34;data-preparation&#34;&gt;Data Preparation
&lt;/h3&gt;&lt;p&gt;Please refer to &lt;a class=&#34;link&#34; href=&#34;data/README.md&#34; &gt;data/README.md&lt;/a&gt; for details about the dataset file format. You can use datasets from the HuggingFace / ModelScope / Modelers hubs, load datasets from local disk, or specify a path to S3/GCS cloud storage.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;[!NOTE]
Please update &lt;code&gt;data/dataset_info.json&lt;/code&gt; to use your custom dataset.&lt;/p&gt;
&lt;/blockquote&gt;
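&lt;p&gt;As a minimal sketch, a custom alpaca-format file placed under &lt;code&gt;data/&lt;/code&gt; is registered with an entry like the following (the name &lt;code&gt;my_dataset&lt;/code&gt; and its file are hypothetical; see &lt;code&gt;data/README.md&lt;/code&gt; for all supported keys):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# hypothetical entry -- merge it into the existing JSON object in data/dataset_info.json
cat &amp;lt;&amp;lt;&#39;EOF&#39;
{
  &amp;#34;my_dataset&amp;#34;: {
    &amp;#34;file_name&amp;#34;: &amp;#34;my_dataset.json&amp;#34;
  }
}
EOF
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;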
&lt;p&gt;You can also use &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ConardLi/easy-dataset&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Easy Dataset&lt;/a&gt;&lt;/strong&gt; or &lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/open-sciencelab/GraphGen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GraphGen&lt;/a&gt;&lt;/strong&gt; to create synthetic data for fine-tuning.&lt;/p&gt;
&lt;h3 id=&#34;quickstart&#34;&gt;Quickstart
&lt;/h3&gt;&lt;p&gt;Use the following 3 commands to run LoRA &lt;strong&gt;fine-tuning&lt;/strong&gt;, &lt;strong&gt;inference&lt;/strong&gt; and &lt;strong&gt;merging&lt;/strong&gt; of the Llama3-8B-Instruct model, respectively.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli chat examples/inference/llama3_lora_sft.yaml
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli &lt;span class=&#34;nb&#34;&gt;export&lt;/span&gt; examples/merge_lora/llama3_lora_sft.yaml
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;See &lt;a class=&#34;link&#34; href=&#34;examples/README.md&#34; &gt;examples/README.md&lt;/a&gt; for advanced usage (including distributed training).&lt;/p&gt;
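&lt;p&gt;For instance, multi-GPU training can be launched from the same configs (a sketch, assuming the &lt;code&gt;FORCE_TORCHRUN&lt;/code&gt; switch available in recent LLaMA-Factory releases):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# launch distributed fine-tuning across all visible GPUs via torchrun
FORCE_TORCHRUN=1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;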
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
Use &lt;code&gt;llamafactory-cli help&lt;/code&gt; to show help information.&lt;/p&gt;
&lt;p&gt;Read &lt;a class=&#34;link&#34; href=&#34;https://github.com/hiyouga/LLaMA-Factory/issues/4614&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FAQs&lt;/a&gt; first if you encounter any problems.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h3 id=&#34;fine-tuning-with-llama-board-gui-powered-by-gradio&#34;&gt;Fine-Tuning with LLaMA Board GUI (powered by &lt;a class=&#34;link&#34; href=&#34;https://github.com/gradio-app/gradio&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gradio&lt;/a&gt;)
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llamafactory-cli webui
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;build-docker&#34;&gt;Build Docker
&lt;/h3&gt;&lt;p&gt;For CUDA users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; docker/docker-cuda/
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose up -d
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For Ascend NPU users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; docker/docker-npu/
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose up -d
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For AMD ROCm users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; docker/docker-rocm/
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose up -d
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker compose &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;details&gt;&lt;summary&gt;Build without Docker Compose&lt;/summary&gt;
&lt;p&gt;For CUDA users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker build -f ./docker/docker-cuda/Dockerfile &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_BNB&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_VLLM&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_DEEPSPEED&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_FLASHATTN&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;PIP_INDEX&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;https://pypi.org/simple &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -t llamafactory:latest .
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run -dit --gpus&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;all &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./hf_cache:/root/.cache/huggingface &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./ms_cache:/root/.cache/modelscope &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./om_cache:/root/.cache/openmind &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./data:/app/data &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./output:/app/output &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 7860:7860 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 8000:8000 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --shm-size 16G &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --name llamafactory &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    llamafactory:latest
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; -it llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For Ascend NPU users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;27
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;28
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Choose the docker image based on your environment&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker build -f ./docker/docker-npu/Dockerfile &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_DEEPSPEED&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;PIP_INDEX&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;https://pypi.org/simple &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -t llamafactory:latest .
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Change `device` based on your resources&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run -dit &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./hf_cache:/root/.cache/huggingface &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./ms_cache:/root/.cache/modelscope &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./om_cache:/root/.cache/openmind &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./data:/app/data &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./output:/app/output &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v /usr/local/dcmi:/usr/local/dcmi &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v /usr/local/Ascend/driver:/usr/local/Ascend/driver &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v /etc/ascend_install.info:/etc/ascend_install.info &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 7860:7860 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 8000:8000 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/davinci0 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/davinci_manager &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/devmm_svm &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/hisi_hdc &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --shm-size 16G &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --name llamafactory &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    llamafactory:latest
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; -it llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For AMD ROCm users:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker build -f ./docker/docker-rocm/Dockerfile &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_BNB&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_VLLM&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_DEEPSPEED&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;INSTALL_FLASHATTN&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --build-arg &lt;span class=&#34;nv&#34;&gt;PIP_INDEX&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;https://pypi.org/simple &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -t llamafactory:latest .
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run -dit &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./hf_cache:/root/.cache/huggingface &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./ms_cache:/root/.cache/modelscope &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./om_cache:/root/.cache/openmind &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./data:/app/data &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./output:/app/output &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -v ./saves:/app/saves &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 7860:7860 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    -p 8000:8000 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/kfd &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --device /dev/dri &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --shm-size 16G &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    --name llamafactory &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    llamafactory:latest
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker &lt;span class=&#34;nb&#34;&gt;exec&lt;/span&gt; -it llamafactory bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;/details&gt;
&lt;details&gt;&lt;summary&gt;Details about volume&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;hf_cache&lt;/code&gt;: Utilize Hugging Face cache on the host machine. Reassignable if a cache already exists in a different directory.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;ms_cache&lt;/code&gt;: Similar to Hugging Face cache but for ModelScope users.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;om_cache&lt;/code&gt;: Similar to Hugging Face cache but for Modelers users.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;data&lt;/code&gt;: Place datasets on this dir of the host machine so that they can be selected on LLaMA Board GUI.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;output&lt;/code&gt;: Set export dir to this location so that the merged result can be accessed directly on the host machine.&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;h3 id=&#34;deploy-with-openai-style-api-and-vllm&#34;&gt;Deploy with OpenAI-style API and vLLM
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nv&#34;&gt;API_PORT&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;8000&lt;/span&gt; llamafactory-cli api examples/inference/llama3.yaml &lt;span class=&#34;nv&#34;&gt;infer_backend&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;vllm &lt;span class=&#34;nv&#34;&gt;vllm_enforce_eager&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;true&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;blockquote&gt;
&lt;p&gt;[!TIP]
Visit &lt;a class=&#34;link&#34; href=&#34;https://platform.openai.com/docs/api-reference/chat/create&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;this page&lt;/a&gt; for API document.&lt;/p&gt;
&lt;p&gt;Examples: &lt;a class=&#34;link&#34; href=&#34;scripts/api_example/test_image.py&#34; &gt;Image understanding&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;scripts/api_example/test_toolcall.py&#34; &gt;Function calling&lt;/a&gt;&lt;/p&gt;
&lt;/blockquote&gt;
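&lt;p&gt;Once the server is up, it can be queried with any OpenAI-compatible client; for example, with curl (a minimal sketch; the model name and prompt are placeholders):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# send a chat completion request to the OpenAI-style endpoint on port 8000
curl http://localhost:8000/v1/chat/completions \
  -H &amp;#34;Content-Type: application/json&amp;#34; \
  -d &#39;{&amp;#34;model&amp;#34;: &amp;#34;test&amp;#34;, &amp;#34;messages&amp;#34;: [{&amp;#34;role&amp;#34;: &amp;#34;user&amp;#34;, &amp;#34;content&amp;#34;: &amp;#34;Hello!&amp;#34;}]}&#39;
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;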
&lt;h3 id=&#34;download-from-modelscope-hub&#34;&gt;Download from ModelScope Hub
&lt;/h3&gt;&lt;p&gt;If you have trouble downloading models and datasets from Hugging Face, you can use ModelScope instead.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;export&lt;/span&gt; &lt;span class=&#34;nv&#34;&gt;USE_MODELSCOPE_HUB&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;1&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# `set USE_MODELSCOPE_HUB=1` for Windows&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Train the model by specifying a model ID of the ModelScope Hub as the &lt;code&gt;model_name_or_path&lt;/code&gt;. You can find a full list of model IDs at &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModelScope Hub&lt;/a&gt;, e.g., &lt;code&gt;LLM-Research/Meta-Llama-3-8B-Instruct&lt;/code&gt;.&lt;/p&gt;
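&lt;p&gt;For example, combining the environment variable with a command-line override of the model ID mentioned above (a minimal sketch):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# download Meta-Llama-3-8B-Instruct from ModelScope instead of Hugging Face
USE_MODELSCOPE_HUB=1 llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
    model_name_or_path=LLM-Research/Meta-Llama-3-8B-Instruct
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;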
&lt;h3 id=&#34;download-from-modelers-hub&#34;&gt;Download from Modelers Hub
&lt;/h3&gt;&lt;p&gt;You can also use Modelers Hub to download models and datasets.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;export&lt;/span&gt; &lt;span class=&#34;nv&#34;&gt;USE_OPENMIND_HUB&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;1&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# `set USE_OPENMIND_HUB=1` for Windows&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Train the model by specifying a model ID of the Modelers Hub as the &lt;code&gt;model_name_or_path&lt;/code&gt;. You can find a full list of model IDs at &lt;a class=&#34;link&#34; href=&#34;https://modelers.cn/models&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Modelers Hub&lt;/a&gt;, e.g., &lt;code&gt;TeleAI/TeleChat-7B-pt&lt;/code&gt;.&lt;/p&gt;
&lt;h3 id=&#34;use-wb-logger&#34;&gt;Use W&amp;amp;B Logger
&lt;/h3&gt;&lt;p&gt;To use &lt;a class=&#34;link&#34; href=&#34;https://wandb.ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Weights &amp;amp; Biases&lt;/a&gt; for logging experimental results, you need to add the following arguments to your YAML files.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-yaml&#34; data-lang=&#34;yaml&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;report_to&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;wandb&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;run_name&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;test_run&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;c&#34;&gt;# optional&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Set &lt;code&gt;WANDB_API_KEY&lt;/code&gt; to &lt;a class=&#34;link&#34; href=&#34;https://wandb.ai/authorize&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;your key&lt;/a&gt; when launching training tasks to log in with your W&amp;amp;B account.&lt;/p&gt;
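&lt;p&gt;Putting it together, a sketch of a launch command that logs to W&amp;amp;B (the key value is a placeholder; &lt;code&gt;report_to&lt;/code&gt; and &lt;code&gt;run_name&lt;/code&gt; are passed as command-line overrides):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# log this training run to your W&amp;amp;B account
WANDB_API_KEY=your_api_key llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
    report_to=wandb run_name=test_run
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;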
&lt;h3 id=&#34;use-swanlab-logger&#34;&gt;Use SwanLab Logger
&lt;/h3&gt;&lt;p&gt;To use &lt;a class=&#34;link&#34; href=&#34;https://github.com/SwanHubX/SwanLab&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SwanLab&lt;/a&gt; for logging experimental results, you need to add the following arguments to your YAML files.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-yaml&#34; data-lang=&#34;yaml&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;use_swanlab&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;true&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nt&#34;&gt;swanlab_run_name&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;l&#34;&gt;test_run&lt;/span&gt;&lt;span class=&#34;w&#34;&gt; &lt;/span&gt;&lt;span class=&#34;c&#34;&gt;# optional&lt;/span&gt;&lt;span class=&#34;w&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;When launching training tasks, you can log in to SwanLab in three ways:&lt;/p&gt;
&lt;ol&gt;
&lt;li&gt;Add &lt;code&gt;swanlab_api_key=&amp;lt;your_api_key&amp;gt;&lt;/code&gt; to the yaml file, and set it to your &lt;a class=&#34;link&#34; href=&#34;https://swanlab.cn/settings&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;API key&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Set the environment variable &lt;code&gt;SWANLAB_API_KEY&lt;/code&gt; to your &lt;a class=&#34;link&#34; href=&#34;https://swanlab.cn/settings&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;API key&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;Use the &lt;code&gt;swanlab login&lt;/code&gt; command to complete the login.&lt;/li&gt;
&lt;/ol&gt;
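&lt;p&gt;For instance, the second option keeps the key out of the config file (a minimal sketch; the key value is a placeholder):&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;# authenticate via the environment and log this run to SwanLab
SWANLAB_API_KEY=your_api_key llamafactory-cli train examples/train_lora/llama3_lora_sft.yaml \
    use_swanlab=true swanlab_run_name=test_run
&lt;/code&gt;&lt;/pre&gt;&lt;/div&gt;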
&lt;h2 id=&#34;projects-using-llama-factory&#34;&gt;Projects using LLaMA Factory
&lt;/h2&gt;&lt;p&gt;If you have a project that should be incorporated, please contact us via email or create a pull request.&lt;/p&gt;
&lt;details&gt;&lt;summary&gt;Click to show&lt;/summary&gt;
&lt;ol&gt;
&lt;li&gt;Wang et al. ESRL: Efficient Sampling-based Reinforcement Learning for Sequence Generation. 2023. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2308.02223&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yu et al. Open, Closed, or Small Language Models for Text Classification? 2023. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2308.10092&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wang et al. UbiPhysio: Support Daily Functioning, Fitness, and Rehabilitation with Action Understanding and Feedback in Natural Language. 2023. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2308.10526&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Luceri et al. Leveraging Large Language Models to Detect Influence Campaigns in Social Media. 2023. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2311.07816&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. Alleviating Hallucinations of Large Language Models through Induced Hallucinations. 2023. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2312.15710&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wang et al. Know Your Needs Better: Towards Structured Understanding of Marketer Demands with Analogical Reasoning Augmented LLMs. KDD 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2401.04319&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wang et al. CANDLE: Iterative Conceptualization and Instantiation Distillation from Large Language Models for Commonsense Reasoning. ACL 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2401.07286&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Choi et al. FACT-GPT: Fact-Checking Augmentation via Claim Matching with LLMs. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.05904&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. AutoMathText: Autonomous Data Selection with Language Models for Mathematical Texts. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.07625&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Lyu et al. KnowTuning: Knowledge-aware Fine-tuning for Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11176&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yang et al. LaCo: Large Language Model Pruning via Layer Collapse. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11187&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Bhardwaj et al. Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned Language Models through Task Arithmetic. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11746&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yang et al. Enhancing Empathetic Response Generation by Augmenting LLMs with Small-scale Empathetic Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11801&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yi et al. Generation Meets Verification: Accelerating Large Language Model Inference with Smart Parallel Auto-Correct Decoding. ACL 2024 Findings. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11809&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Cao et al. Head-wise Shareable Attention for Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.11819&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. Enhancing Multilingual Capabilities of Large Language Models through Self-Distillation from Resource-Rich Languages. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.12204&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Kim et al. Efficient and Effective Vocabulary Expansion Towards Multilingual Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.14714&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yu et al. KIEval: A Knowledge-grounded Interactive Evaluation Framework for Large Language Models. ACL 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2402.15043&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Huang et al. Key-Point-Driven Data Synthesis with its Enhancement on Mathematical Reasoning. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.02333&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Duan et al. Negating Negatives: Alignment without Human Positive Samples via Distributional Dispreference Optimization. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.03419&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xie and Schwertfeger. Empowering Robotics with Large Language Models: osmAG Map Comprehension with LLMs. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.08228&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wu et al. Large Language Models are Parallel Multilingual Learners. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.09073&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. EDT: Improving Large Language Models&amp;rsquo; Generation by Entropy-based Dynamic Temperature Sampling. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.14541&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Weller et al. FollowIR: Evaluating and Teaching Information Retrieval Models to Follow Instructions. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.15246&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hongbin Na. CBT-LLM: A Chinese Large Language Model for Cognitive Behavioral Therapy-based Mental Health Question Answering. COLING 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.16008&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zan et al. CodeS: Natural Language to Code Repository via Multi-Layer Sketch. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2403.16443&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liu et al. Extensive Self-Contrast Enables Feedback-Free Language Model Alignment. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.00604&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Luo et al. BAdam: A Memory Efficient Full Parameter Training Method for Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.02827&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Du et al. Chinese Tiny LLM: Pretraining a Chinese-Centric Large Language Model. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.04167&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Ma et al. Parameter Efficient Quasi-Orthogonal Fine-Tuning via Givens Rotation. ICML 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.04316&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liu et al. Dynamic Generation of Personalities with Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.07084&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Shang et al. How Far Have We Gone in Stripped Binary Code Understanding Using Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.09836&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Huang et al. LLMTune: Accelerate Database Knob Tuning with Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.11581&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Deng et al. Text-Tuple-Table: Towards Information Integration in Text-to-Table Generation via Global Tuple Extraction. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.14215&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Acikgoz et al. Hippocrates: An Open-Source Framework for Advancing Large Language Models in Healthcare. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.16621&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. Small Language Models Need Strong Verifiers to Self-Correct Reasoning. ACL 2024 Findings. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.17140&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhou et al. FREB-TQA: A Fine-Grained Robustness Evaluation Benchmark for Table Question Answering. NAACL 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2404.18585&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xu et al. Large Language Models for Cyber Security: A Systematic Literature Review. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.04760&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Dammu et al. &amp;ldquo;They are uncultured&amp;rdquo;: Unveiling Covert Harms and Social Threats in LLM Generated Conversations. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.05378&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yi et al. A safety realignment framework via subspace-oriented model fusion for large language models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.09055&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Lou et al. SPO: Multi-Dimensional Preference Sequential Alignment With Implicit Reward Modeling. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.12739&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. Getting More from Less: Large Language Models are Good Spontaneous Multilingual Learners. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.13816&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhang et al. TS-Align: A Teacher-Student Collaborative Framework for Scalable Iterative Finetuning of Large Language Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2405.20215&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zihong Chen. Sentence Segmentation and Sentence Punctuation Based on XunziALLM. 2024. &lt;a class=&#34;link&#34; href=&#34;https://aclanthology.org/2024.lt4hala-1.30&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Gao et al. The Best of Both Worlds: Toward an Honest and Helpful Large Language Model. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.00380&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wang and Song. MARS: Benchmarking the Metaphysical Reasoning Abilities of Language Models with a Multi-task Evaluation Dataset. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.02106&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hu et al. Computational Limits of Low-Rank Adaptation (LoRA) for Transformer-Based Models. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.03136&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Ge et al. Time Sensitive Knowledge Editing through Efficient Finetuning. ACL 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.04496&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Tan et al. Peer Review as A Multi-Turn and Long-Context Dialogue with Role-Based Interactions. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.05688&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Song et al. Turbo Sparse: Achieving LLM SOTA Performance with Minimal Activated Parameters. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.05955&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Gu et al. RWKV-CLIP: A Robust Vision-Language Representation Learner. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.06973&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Chen et al. Advancing Tool-Augmented Large Language Models: Integrating Insights from Errors in Inference Trees. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.07115&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhu et al. Are Large Language Models Good Statisticians? 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.07815&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Li et al. Know the Unknown: An Uncertainty-Sensitive Method for LLM Instruction Tuning. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.10099&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Ding et al. IntentionQA: A Benchmark for Evaluating Purchase Intention Comprehension Abilities of Language Models in E-commerce. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.10173&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;He et al. COMMUNITY-CROSS-INSTRUCT: Unsupervised Instruction Generation for Aligning Large Language Models to Online Communities. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.12074&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Lin et al. FVEL: Interactive Formal Verification Environment with Large Language Models via Theorem Proving. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.14408&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Treutlein et al. Connecting the Dots: LLMs can Infer and Verbalize Latent Structure from Disparate Training Data. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.14546&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Feng et al. SS-Bench: A Benchmark for Social Story Generation and Evaluation. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.15695&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Feng et al. Self-Constructed Context Decompilation with Fined-grained Alignment Enhancement. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.17233&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liu et al. Large Language Models for Cuffless Blood Pressure Measurement From Wearable Biosignals. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.18069&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Iyer et al. Exploring Very Low-Resource Translation with LLMs: The University of Edinburgh&amp;rsquo;s Submission to AmericasNLP 2024 Translation Task. AmericasNLP 2024. &lt;a class=&#34;link&#34; href=&#34;https://aclanthology.org/2024.americasnlp-1.25&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Li et al. Calibrating LLMs with Preference Optimization on Thought Trees for Generating Rationale in Science Question Scoring. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2406.19949&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yang et al. Financial Knowledge Large Language Model. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.00365&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Lin et al. DogeRM: Equipping Reward Models with Domain Knowledge through Model Merging. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.01470&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Bako et al. Evaluating the Semantic Profiling Abilities of LLMs for Natural Language Utterances in Data Visualization. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.06129&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Huang et al. RoLoRA: Fine-tuning Rotated Outlier-free LLMs for Effective Weight-Activation Quantization. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.08044&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Jiang et al. LLM-Collaboration on Automatic Science Journalism for the General Audience. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.09756&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Inouye et al. Applied Auto-tuning on LoRA Hyperparameters. 2024. &lt;a class=&#34;link&#34; href=&#34;https://scholarcommons.scu.edu/cseng_senior/272/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Qi et al. Research on Tibetan Tourism Viewpoints information generation system based on LLM. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.13561&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xu et al. Course-Correction: Safety Alignment Using Synthetic Preferences. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.16637&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Sun et al. LAMBDA: A Large Model Based Data Agent. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.17535&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zhu et al. CollectiveSFT: Scaling Large Language Models for Chinese Medical Benchmark with Collective Instructions in Healthcare. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2407.19705&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Yu et al. Correcting Negative Bias in Large Language Models through Negative Attention Score Alignment. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2408.00137&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xie et al. The Power of Personalized Datasets: Advancing Chinese Composition Writing for Elementary School through Targeted Model Fine-Tuning. IALP 2024. &lt;a class=&#34;link&#34; href=&#34;https://www.asianlp.sg/conferences/ialp2024/proceedings/papers/IALP2024_P055.pdf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liu et al. Instruct-Code-Llama: Improving Capabilities of Language Model in Competition Level Code Generation by Online Judge Feedback. ICIC 2024. &lt;a class=&#34;link&#34; href=&#34;https://link.springer.com/chapter/10.1007/978-981-97-5669-8_11&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Wang et al. Cybernetic Sentinels: Unveiling the Impact of Safety Data Selection on Model Security in Supervised Fine-Tuning. ICIC 2024. &lt;a class=&#34;link&#34; href=&#34;https://link.springer.com/chapter/10.1007/978-981-97-5669-8_23&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xia et al. Understanding the Performance and Estimating the Cost of LLM Fine-Tuning. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2408.04693&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zeng et al. Perceive, Reflect, and Plan: Designing LLM Agent for Goal-Directed City Navigation without Instructions. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2408.04168&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Xia et al. Using Pre-trained Language Model for Accurate ESG Prediction. FinNLP 2024. &lt;a class=&#34;link&#34; href=&#34;https://aclanthology.org/2024.finnlp-2.1/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Liang et al. I-SHEEP: Self-Alignment of LLM from Scratch through an Iterative Self-Enhancement Paradigm. 2024. &lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2408.08072&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[arxiv]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Bai et al. Aligning Large Language Model with Direct Multi-Preference Optimization for Recommendation. CIKM 2024. &lt;a class=&#34;link&#34; href=&#34;https://dl.acm.org/doi/10.1145/3627673.3679611&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[paper]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Yu-Yang-Li/StarWhisper&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StarWhisper&lt;/a&gt;&lt;/strong&gt;: A large language model for Astronomy, based on ChatGLM2-6B and Qwen-14B.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/FudanDISC/DISC-LawLLM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DISC-LawLLM&lt;/a&gt;&lt;/strong&gt;: A large language model specialized in the Chinese legal domain, based on Baichuan-13B, capable of retrieving and reasoning over legal knowledge.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/X-D-Lab/Sunsimiao&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Sunsimiao&lt;/a&gt;&lt;/strong&gt;: A large language model specialized in the Chinese medical domain, based on Baichuan-7B and ChatGLM-6B.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/WangRongsheng/CareGPT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;CareGPT&lt;/a&gt;&lt;/strong&gt;: A series of large language models for the Chinese medical domain, based on LLaMA2-7B and Baichuan-13B.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/PKU-YuanGroup/Machine-Mindset/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MachineMindset&lt;/a&gt;&lt;/strong&gt;: A series of MBTI-personality large language models, capable of giving any LLM one of 16 personality types through different datasets and training methods.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Nekochu/Luminia-13B-v3&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Luminia-13B-v3&lt;/a&gt;&lt;/strong&gt;: A large language model specialized in generating metadata for Stable Diffusion. &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/Nekochu/Luminia-13B_SD_Prompt&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[demo]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/BUAADreamer/Chinese-LLaVA-Med&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Chinese-LLaVA-Med&lt;/a&gt;&lt;/strong&gt;: A multimodal large language model specialized in the Chinese medical domain, based on LLaVA-1.5-7B.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/THUDM/AutoRE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AutoRE&lt;/a&gt;&lt;/strong&gt;: A document-level relation extraction system based on large language models.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/RTX-AI-Toolkit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA RTX AI Toolkit&lt;/a&gt;&lt;/strong&gt;: SDKs for fine-tuning LLMs on Windows PCs with NVIDIA RTX GPUs.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/LazyAGI/LazyLLM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LazyLLM&lt;/a&gt;&lt;/strong&gt;: An easy and lazy way to build multi-agent LLM applications; supports model fine-tuning via LLaMA Factory.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NLPJCL/RAG-Retrieval&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RAG-Retrieval&lt;/a&gt;&lt;/strong&gt;: A full pipeline for RAG retrieval model fine-tuning, inference, and distillation. &lt;a class=&#34;link&#34; href=&#34;https://zhuanlan.zhihu.com/p/987727357&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[blog]&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Qihoo360/360-LLaMA-Factory&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;360-LLaMA-Factory&lt;/a&gt;&lt;/strong&gt;: A modified library that supports long-sequence SFT &amp;amp; DPO using ring attention.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://novasky-ai.github.io/posts/sky-t1/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Sky-T1&lt;/a&gt;&lt;/strong&gt;: An o1-like model fine-tuned by NovaSky AI at very low cost.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/xming521/WeClone&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;WeClone&lt;/a&gt;&lt;/strong&gt;: One-stop solution for creating your digital avatar from chat logs.&lt;/li&gt;
&lt;/ol&gt;
&lt;/details&gt;
&lt;h2 id=&#34;license&#34;&gt;License
&lt;/h2&gt;&lt;p&gt;This repository is licensed under the &lt;a class=&#34;link&#34; href=&#34;LICENSE&#34; &gt;Apache-2.0 License&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;Please follow the model licenses to use the corresponding model weights: &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/baichuan-inc/Baichuan2-7B-Base/blob/main/Community%20License%20for%20Baichuan%202%20Model.pdf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Baichuan 2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/bigscience/license&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BLOOM&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/THUDM/ChatGLM3/blob/main/MODEL_LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ChatGLM3&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://cohere.com/c4ai-cc-by-nc-license&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Command R&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/deepseek-ai/DeepSeek-LLM/blob/main/LICENSE-MODEL&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DeepSeek&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/tiiuae/falcon-180B/blob/main/LICENSE.txt&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Falcon&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://ai.google.dev/gemma/terms&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/glm-4-9b/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-4&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/openai/gpt-2/blob/master/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GPT-2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;LICENSE&#34; &gt;Granite&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/IndexTeam/Index-1.9B/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Index&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/InternLM/InternLM#license&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternLM&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://ai.meta.com/llama/license/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://llama.meta.com/llama3/license/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 3&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama4/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama 4&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/OpenBMB/MiniCPM/blob/main/MiniCPM%20Model%20License.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MiniCPM&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;LICENSE&#34; &gt;Mistral/Mixtral/Pixtral&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;LICENSE&#34; &gt;OLMo&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft/phi-1_5/resolve/main/Research%20License.docx&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-1.5/Phi-2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/microsoft/Phi-3-mini-4k-instruct/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi-3/Phi-4&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/QwenLM/Qwen/blob/main/Tongyi%20Qianwen%20LICENSE%20AGREEMENT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Skywork/Skywork-13B-base/blob/main/Skywork%20Community%20License.pdf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Skywork&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StarCoder 2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Tele-AI/telechat-7B/blob/main/TeleChat%E6%A8%A1%E5%9E%8B%E7%A4%BE%E5%8C%BA%E8%AE%B8%E5%8F%AF%E5%8D%8F%E8%AE%AE.pdf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TeleChat2&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/xverse-ai/XVERSE-13B/blob/main/MODEL_LICENSE.pdf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;XVERSE&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/01-ai/Yi-6B/blob/main/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yi&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;LICENSE&#34; &gt;Yi-1.5&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/IEIT-Yuan/Yuan-2.0/blob/main/LICENSE-Yuan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yuan 2&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;citation&#34;&gt;Citation
&lt;/h2&gt;&lt;p&gt;If this work is helpful, please cite it as:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bibtex&#34; data-lang=&#34;bibtex&#34;&gt;@inproceedings{zheng2024llamafactory,
  title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models},
  author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma},
  booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)},
  address={Bangkok, Thailand},
  publisher={Association for Computational Linguistics},
  year={2024},
  url={http://arxiv.org/abs/2403.13372}
}&lt;/code&gt;&lt;/pre&gt;
&lt;/div&gt;&lt;h2 id=&#34;acknowledgement&#34;&gt;Acknowledgement
&lt;/h2&gt;&lt;p&gt;This repo benefits from &lt;a class=&#34;link&#34; href=&#34;https://github.com/huggingface/peft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PEFT&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/huggingface/trl&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;TRL&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/artidoro/qlora&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;QLoRA&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/lm-sys/FastChat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FastChat&lt;/a&gt;. Thanks for their wonderful work.&lt;/p&gt;
&lt;h2 id=&#34;star-history&#34;&gt;Star History
&lt;/h2&gt;&lt;p&gt;&lt;img src=&#34;https://api.star-history.com/svg?repos=hiyouga/LLaMA-Factory&amp;amp;type=Date&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Star History Chart&#34;
	
	
&gt;&lt;/p&gt;
</description>
        </item>
        <item>
        <title>llama-cookbook</title>
        <link>https://producthunt.programnotes.cn/en/p/llama-cookbook/</link>
        <pubDate>Wed, 09 Apr 2025 15:29:20 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/llama-cookbook/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1516503424803-708327384b90?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NDQxODM2NTd8&amp;ixlib=rb-4.0.3" alt="Featured image of post llama-cookbook" /&gt;&lt;h1 id=&#34;meta-llamallama-cookbook&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;meta-llama/llama-cookbook&lt;/a&gt;
&lt;/h1&gt;&lt;h1 id=&#34;llama-cookbook-the-official-guide-to-building-with-llama-models&#34;&gt;Llama Cookbook: The Official Guide to building with Llama Models
&lt;/h1&gt;&lt;p&gt;Check out our latest model tutorial here: &lt;a class=&#34;link&#34; href=&#34;./getting-started/build_with_llama_4.ipynb&#34; &gt;Build with Llama 4 Scout&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;Welcome to the official repository for helping you get started with &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/inference/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;inference&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/finetuning&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fine-tuning&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/end-to-end-use-cases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;end-to-end use-cases&lt;/a&gt; of building with the Llama Model family.&lt;/p&gt;
&lt;p&gt;This repository covers the most popular community approaches, use-cases and the latest recipes for Llama Text and Vision models.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
Popular getting started links:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/build_with_llama_4.ipynb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Build with Llama 4 Scout&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/inference/local_inference/README.md#multimodal-inference&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Multimodal Inference with Llama 3.2 Vision&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/responsible_ai/llama_guard/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Inferencing using Llama Guard (Safety Model)&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/blockquote&gt;
&lt;blockquote&gt;
&lt;p&gt;[!TIP]
Popular end to end recipes:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/end-to-end-use-cases/email_agent/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Email Agent&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/end-to-end-use-cases/NotebookLlama/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NotebookLlama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/end-to-end-use-cases/coding/text2sql/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Text to SQL&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/blockquote&gt;
&lt;blockquote&gt;
&lt;p&gt;Note: We recently refactored the repo; &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/archive-main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;archive-main&lt;/a&gt; is a snapshot branch from before the refactor.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;h2 id=&#34;repository-structure&#34;&gt;Repository Structure:
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/3p-integrations&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;3P Integrations&lt;/a&gt;: Getting Started Recipes and End to End Use-Cases from various Llama providers&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/end-to-end-use-cases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;End to End Use Cases&lt;/a&gt;: As the name suggests, spanning various domains and applications&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/getting-started/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Getting Started&lt;/a&gt;: Reference for inferencing, fine-tuning and RAG examples&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/src/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;src&lt;/a&gt;: Contains the source of the original llama-recipes library, along with some FAQs for fine-tuning.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;faq&#34;&gt;FAQ:
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Q:&lt;/strong&gt; What happened to llama-recipes?
&lt;strong&gt;A:&lt;/strong&gt; We recently renamed llama-recipes to llama-cookbook.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Q:&lt;/strong&gt; Are there prompt template changes for multimodality?
&lt;strong&gt;A:&lt;/strong&gt; Llama 3.2 follows the same prompt template as Llama 3.1, with a new special token &lt;code&gt;&amp;lt;|image|&amp;gt;&lt;/code&gt; representing the input image for the multimodal models. More details on the prompt templates for image reasoning, tool-calling, and code interpreter can be found &lt;a class=&#34;link&#34; href=&#34;https://llama.meta.com/docs/model-cards-and-prompt-formats/llama3_2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;on the documentation website&lt;/a&gt;. A minimal sketch of the resulting prompt string appears after this list.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Q:&lt;/strong&gt; I have some questions about fine-tuning; is there a section that addresses them?
&lt;strong&gt;A:&lt;/strong&gt; Check out the Fine-Tuning FAQ &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/main/src/docs/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Q:&lt;/strong&gt; Some links are broken or folders are missing.
&lt;strong&gt;A:&lt;/strong&gt; We recently refactored the repo; &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-cookbook/tree/archive-main&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;archive-main&lt;/a&gt; is a snapshot branch from before the refactor.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Q:&lt;/strong&gt; Where can we find details about the latest models?
&lt;strong&gt;A:&lt;/strong&gt; Official &lt;a class=&#34;link&#34; href=&#34;https://www.llama.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama models website&lt;/a&gt;.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
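&lt;p&gt;To make the multimodal template above concrete, here is a minimal sketch (not an official snippet from this repository) of the raw prompt string it implies, assuming the Llama 3.1 header format with the &lt;code&gt;&amp;lt;|image|&amp;gt;&lt;/code&gt; token prepended to the user turn; consult the documentation website linked above for the authoritative format.&lt;/p&gt;
&lt;pre tabindex=&#34;0&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;# Minimal sketch: hand-building a Llama 3.2 multimodal prompt string.
# The header tokens follow the Llama 3.1 chat format described in the FAQ;
# the &amp;lt;|image|&amp;gt; token marks where the input image is injected.
prompt = (
    &#34;&amp;lt;|begin_of_text|&amp;gt;&amp;lt;|start_header_id|&amp;gt;user&amp;lt;|end_header_id|&amp;gt;\n\n&#34;
    &#34;&amp;lt;|image|&amp;gt;Describe this image in two sentences.&amp;lt;|eot_id|&amp;gt;&#34;
    &#34;&amp;lt;|start_header_id|&amp;gt;assistant&amp;lt;|end_header_id|&amp;gt;\n\n&#34;
)
print(prompt)
&lt;/code&gt;&lt;/pre&gt;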
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;p&gt;Please read &lt;a class=&#34;link&#34; href=&#34;CONTRIBUTING.md&#34; &gt;CONTRIBUTING.md&lt;/a&gt; for details on our code of conduct, and the process for submitting pull requests to us.&lt;/p&gt;
&lt;h2 id=&#34;license&#34;&gt;License
&lt;/h2&gt;&lt;!-- markdown-link-check-disable --&gt;
&lt;p&gt;See the License file for Meta Llama 3.2 &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt; and Acceptable Use Policy &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/USE_POLICY.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;See the License file for Meta Llama 3.1 &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt; and Acceptable Use Policy &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/USE_POLICY.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;See the License file for Meta Llama 3 &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt; and Acceptable Use Policy &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama3/USE_POLICY.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;See the License file for Meta Llama 2 &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama2/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt; and Acceptable Use Policy &lt;a class=&#34;link&#34; href=&#34;https://github.com/meta-llama/llama-models/blob/main/models/llama2/USE_POLICY.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;&lt;/p&gt;
&lt;!-- markdown-link-check-enable --&gt;
</description>
        </item>
        
    </channel>
</rss>
