<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>GGUF on Producthunt daily</title>
        <link>https://producthunt.programnotes.cn/en/tags/gguf/</link>
        <description>Recent content in GGUF on Producthunt daily</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <lastBuildDate>Tue, 14 Oct 2025 15:29:21 +0800</lastBuildDate><atom:link href="https://producthunt.programnotes.cn/en/tags/gguf/index.xml" rel="self" type="application/rss+xml" /><item>
        <title>llama.cpp</title>
        <link>https://producthunt.programnotes.cn/en/p/llama.cpp/</link>
        <pubDate>Tue, 14 Oct 2025 15:29:21 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/llama.cpp/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1641738876363-a0728bf25a8d?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NjA0MjY4ODR8&amp;ixlib=rb-4.1.0" alt="Featured image of post llama.cpp" /&gt;&lt;h1 id=&#34;ggml-orgllamacpp&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ggml-org/llama.cpp&lt;/a&gt;
&lt;/h1&gt;&lt;h1 id=&#34;llamacpp&#34;&gt;llama.cpp
&lt;/h1&gt;&lt;p&gt;&lt;img src=&#34;https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;llama&#34;
	
	
&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://opensource.org/licenses/MIT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/license-MIT-blue.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;License: MIT&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/releases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/github/v/release/ggml-org/llama.cpp&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Release&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Server&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/205&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Manifesto&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/ggml&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ggml&lt;/a&gt; / &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/ops.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ops&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;LLM inference in C/C++&lt;/p&gt;
&lt;h2 id=&#34;recent-api-changes&#34;&gt;Recent API changes
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/issues/9289&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Changelog for &lt;code&gt;libllama&lt;/code&gt; API&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/issues/9291&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Changelog for &lt;code&gt;llama-server&lt;/code&gt; REST API&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;hot-topics&#34;&gt;Hot topics
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/15396&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;guide : running gpt-oss with llama.cpp&lt;/a&gt;&lt;/strong&gt;&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/15313&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;[FEEDBACK] Better packaging for llama.cpp to support downstream consumers 🤗&lt;/a&gt;&lt;/strong&gt;&lt;/li&gt;
&lt;li&gt;Support for the &lt;code&gt;gpt-oss&lt;/code&gt; model with native MXFP4 format has been added | &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/15091&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PR&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Collaboration with NVIDIA&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/15095&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Comment&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hot PRs: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr&amp;#43;label%3Ahot&amp;#43;&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;All&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr&amp;#43;label%3Ahot&amp;#43;is%3Aopen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Open&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Multimodal support arrived in &lt;code&gt;llama-server&lt;/code&gt;: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/12898&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;#12898&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/multimodal.md&#34; &gt;documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;VS Code extension for FIM completions: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.vscode&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.vscode&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Vim/Neovim plugin for FIM completions: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.vim&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.vim&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Introducing GGUF-my-LoRA &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/10123&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.cpp/discussions/10123&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hugging Face Inference Endpoints now support GGUF out of the box! &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/9669&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.cpp/discussions/9669&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Hugging Face GGUF editor: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/9268&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;discussion&lt;/a&gt; | &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/CISCai/gguf-editor&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;tool&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;hr&gt;
&lt;h2 id=&#34;quick-start&#34;&gt;Quick start
&lt;/h2&gt;&lt;p&gt;Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Install &lt;code&gt;llama.cpp&lt;/code&gt; using &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/install.md&#34; &gt;brew, nix or winget&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Run with Docker - see our &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/docker.md&#34; &gt;Docker documentation&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Download pre-built binaries from the &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/releases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;releases page&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Build from source by cloning this repository - check out &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/build.md&#34; &gt;our build guide&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Once installed, you&amp;rsquo;ll need a model to work with. Head to the &lt;a class=&#34;link&#34; href=&#34;#obtaining-and-quantizing-models&#34; &gt;Obtaining and quantizing models&lt;/a&gt; section to learn more.&lt;/p&gt;
&lt;p&gt;Example command:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-sh&#34; data-lang=&#34;sh&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Use a local model file&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m my_model.gguf
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Or download and run a model directly from Hugging Face&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Launch OpenAI-compatible API server&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -hf ggml-org/gemma-3-1b-it-GGUF
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;description&#34;&gt;Description
&lt;/h2&gt;&lt;p&gt;The main goal of &lt;code&gt;llama.cpp&lt;/code&gt; is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
range of hardware - locally and in the cloud.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Plain C/C++ implementation without any dependencies&lt;/li&gt;
&lt;li&gt;Apple silicon is a first-class citizen - optimized via ARM NEON, Accelerate and Metal frameworks&lt;/li&gt;
&lt;li&gt;AVX, AVX2, AVX512 and AMX support for x86 architectures&lt;/li&gt;
&lt;li&gt;1.5-bit, 2-bit, 3-bit, 4-bit, 5-bit, 6-bit, and 8-bit integer quantization for faster inference and reduced memory use&lt;/li&gt;
&lt;li&gt;Custom CUDA kernels for running LLMs on NVIDIA GPUs (support for AMD GPUs via HIP and Moore Threads GPUs via MUSA)&lt;/li&gt;
&lt;li&gt;Vulkan and SYCL backend support&lt;/li&gt;
&lt;li&gt;CPU+GPU hybrid inference to partially accelerate models larger than the total VRAM capacity&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;The &lt;code&gt;llama.cpp&lt;/code&gt; project is the main playground for developing new features for the &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/ggml&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ggml&lt;/a&gt; library.&lt;/p&gt;
&lt;details&gt;
&lt;summary&gt;Models&lt;/summary&gt;
&lt;p&gt;Typically finetunes of the base models below are supported as well.&lt;/p&gt;
&lt;p&gt;Instructions for adding support for new models: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/blob/master/docs/development/HOWTO-add-model.md&#34; &gt;HOWTO-add-model.md&lt;/a&gt;&lt;/p&gt;
&lt;h4 id=&#34;text-only&#34;&gt;Text-only
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; LLaMA 🦙&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; LLaMA 2 🦙🦙&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; LLaMA 3 🦙🦙🦙&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/mistralai/Mistral-7B-v0.1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mistral 7B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=mistral-ai/Mixtral&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mixtral MoE&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/databricks/dbrx-instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;DBRX&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=tiiuae/falcon&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Falcon&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ymcui/Chinese-LLaMA-Alpaca&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Chinese LLaMA / Alpaca&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://github.com/ymcui/Chinese-LLaMA-Alpaca-2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Chinese LLaMA-2 / Alpaca-2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/bofenghuang/vigogne&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Vigogne (French)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/5423&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BERT&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://bair.berkeley.edu/blog/2023/04/03/koala/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Koala&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=baichuan-inc/Baichuan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Baichuan 1 &amp;amp; 2&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/hiyouga/baichuan-7b-sft&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;derivations&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=BAAI/Aquila&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Aquila 1 &amp;amp; 2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/3187&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Starcoder models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/smallcloudai/Refact-1_6B-fim&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Refact&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/3417&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MPT&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/3553&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bloom&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=01-ai/Yi&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yi models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/stabilityai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;StableLM models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=deepseek-ai/deepseek&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Deepseek models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=Qwen/Qwen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/3557&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PLaMo-13B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=microsoft/phi&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Phi models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/11003&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PhiMoE&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/gpt2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GPT-2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/5118&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Orion 14B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=internlm2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;InternLM2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/WisdomShell/codeshell&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;CodeShell&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://ai.google.dev/gemma&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Gemma&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/state-spaces/mamba&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mamba&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/keyfan/grok-1-hf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Grok-1&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=xverse&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Xverse&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=CohereForAI/c4ai-command-r&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Command-R models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=sea-lion&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SEA-LION&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/GritLM/GritLM-7B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GritLM-7B&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/GritLM/GritLM-8x7B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GritLM-8x7B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://allenai.org/olmo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OLMo&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://allenai.org/olmo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OLMo 2&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/allenai/OLMoE-1B-7B-0924&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OLMoE&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Granite models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/EleutherAI/gpt-neox&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GPT-NeoX&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://github.com/EleutherAI/pythia&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Pythia&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Snowflake-Arctic MoE&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=Smaug&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Smaug&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/LumiOpen/Poro-34B&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Poro 34B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/1bitLLM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bitnet b1.58 models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=flan-t5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Flan T5&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Open Elm models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/chatglm3-6b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ChatGLM3-6b&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/glm-4-9b&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ChatGLM4-9b&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/glm-edge-1.5b-chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLMEdge-1.5b&lt;/a&gt; + &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/THUDM/glm-edge-4b-chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLMEdge-4b&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/THUDM/glm-4-0414-67f3cbcb34dd9d252707cb2e&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-4-0414&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SmolLM&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;EXAONE-3.0-7.8B-Instruct&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/tiiuae/falconmamba-7b-66b9a580324dd1598b0f6d4a&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FalconMamba Models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/inceptionai/jais-13b-chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Jais&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/speakleash/bielik-11b-v23-66ee813238d9b526a072408a&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bielik-11B-v2.3&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/BlinkDL/RWKV-LM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RWKV-6&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/recursal/QRWKV6-32B-Instruct-Preview-v0.1&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;QRWKV-6&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GigaChat-20B-A3B&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/trillionlabs/Trillion-7B-preview&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Trillion-7B-preview&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/inclusionAI/ling-67c51c85b34a7ea0aba94c32&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Ling models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LFM2 models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hunyuan models&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h4 id=&#34;multimodal&#34;&gt;Multimodal
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA 1.5 models&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaVA 1.6 models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=SkunkworksAI/Bakllava&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;BakLLaVA&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/NousResearch/Obsidian-3B-V0.5&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Obsidian&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=Lin-Chen/ShareGPT4V&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ShareGPT4V&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=mobileVLM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MobileVLM 1.7B/3B models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=Yi-VL&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Yi-VL&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=MiniCPM&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mini CPM&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/vikhyatk/moondream2&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Moondream&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/BAAI-DCAI/Bunny&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Bunny&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?search=glm-edge&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GLM-EDGE&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/Qwen/qwen2-vl-66cee7455501d7126940800d&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Qwen2-VL&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;input checked=&#34;&#34; disabled=&#34;&#34; type=&#34;checkbox&#34;&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LFM2-VL&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Bindings&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;Python: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ddh0/easy-llama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ddh0/easy-llama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Python: &lt;a class=&#34;link&#34; href=&#34;https://github.com/abetlen/llama-cpp-python&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;abetlen/llama-cpp-python&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Go: &lt;a class=&#34;link&#34; href=&#34;https://github.com/go-skynet/go-llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;go-skynet/go-llama.cpp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Node.js: &lt;a class=&#34;link&#34; href=&#34;https://github.com/withcatai/node-llama-cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;withcatai/node-llama-cpp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;JS/TS (llama.cpp server client): &lt;a class=&#34;link&#34; href=&#34;https://modelfusion.dev/integration/model-provider/llamacpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;lgrammel/modelfusion&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;JS/TS (Programmable Prompt Engine CLI): &lt;a class=&#34;link&#34; href=&#34;https://github.com/offline-ai/cli&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;offline-ai/cli&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;JavaScript/Wasm (works in browser): &lt;a class=&#34;link&#34; href=&#34;https://github.com/tangledgroup/llama-cpp-wasm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;tangledgroup/llama-cpp-wasm&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Typescript/Wasm (nicer API, available on npm): &lt;a class=&#34;link&#34; href=&#34;https://github.com/ngxson/wllama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ngxson/wllama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Ruby: &lt;a class=&#34;link&#34; href=&#34;https://github.com/yoshoku/llama_cpp.rb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;yoshoku/llama_cpp.rb&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Rust (more features): &lt;a class=&#34;link&#34; href=&#34;https://github.com/edgenai/llama_cpp-rs&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;edgenai/llama_cpp-rs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Rust (nicer API): &lt;a class=&#34;link&#34; href=&#34;https://github.com/mdrokz/rust-llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;mdrokz/rust-llama.cpp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Rust (more direct bindings): &lt;a class=&#34;link&#34; href=&#34;https://github.com/utilityai/llama-cpp-rs&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;utilityai/llama-cpp-rs&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Rust (automated build from crates.io): &lt;a class=&#34;link&#34; href=&#34;https://github.com/ShelbyJenkins/llm_client&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ShelbyJenkins/llm_client&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;C#/.NET: &lt;a class=&#34;link&#34; href=&#34;https://github.com/SciSharp/LLamaSharp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SciSharp/LLamaSharp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;C#/VB.NET (more features - community license): &lt;a class=&#34;link&#34; href=&#34;https://docs.lm-kit.com/lm-kit-net/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LM-Kit.NET&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Scala 3: &lt;a class=&#34;link&#34; href=&#34;https://github.com/donderom/llm4s&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;donderom/llm4s&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Clojure: &lt;a class=&#34;link&#34; href=&#34;https://github.com/phronmophobic/llama.clj&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;phronmophobic/llama.clj&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;React Native: &lt;a class=&#34;link&#34; href=&#34;https://github.com/mybigday/llama.rn&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;mybigday/llama.rn&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Java: &lt;a class=&#34;link&#34; href=&#34;https://github.com/kherud/java-llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;kherud/java-llama.cpp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Java: &lt;a class=&#34;link&#34; href=&#34;https://github.com/QuasarByte/llama-cpp-jna&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;QuasarByte/llama-cpp-jna&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Zig: &lt;a class=&#34;link&#34; href=&#34;https://github.com/Deins/llama.cpp.zig&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;deins/llama.cpp.zig&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Flutter/Dart: &lt;a class=&#34;link&#34; href=&#34;https://github.com/netdur/llama_cpp_dart&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;netdur/llama_cpp_dart&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Flutter: &lt;a class=&#34;link&#34; href=&#34;https://github.com/xuegao-tzx/Fllama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;xuegao-tzx/Fllama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;PHP (API bindings and features built on top of llama.cpp): &lt;a class=&#34;link&#34; href=&#34;https://github.com/distantmagic/resonance&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;distantmagic/resonance&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/pull/6326&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;(more info)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Guile Scheme: &lt;a class=&#34;link&#34; href=&#34;https://savannah.nongnu.org/projects/guile-llama-cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;guile_llama_cpp&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Swift: &lt;a class=&#34;link&#34; href=&#34;https://github.com/srgtuszy/llama-cpp-swift&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;srgtuszy/llama-cpp-swift&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Swift: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ShenghaiWang/SwiftLlama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ShenghaiWang/SwiftLlama&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;Delphi: &lt;a class=&#34;link&#34; href=&#34;https://github.com/Embarcadero/llama-cpp-delphi&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Embarcadero/llama-cpp-delphi&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;UIs&lt;/summary&gt;
&lt;p&gt;&lt;em&gt;(to have a project listed here, it should clearly state that it depends on &lt;code&gt;llama.cpp&lt;/code&gt;)&lt;/em&gt;&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/yaroslavyaroslav/OpenAI-sublime-text&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;AI Sublime Text plugin&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/cztomsik/ava&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;cztomsik/ava&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/alexpinel/Dot&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Dot&lt;/a&gt; (GPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ylsdamxssjxxdd/eva&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;eva&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/iohub/coLLaMA&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;iohub/collama&lt;/a&gt; (Apache-2.0)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/janhq/jan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;janhq/jan&lt;/a&gt; (AGPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/johnbean393/Sidekick&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;johnbean393/Sidekick&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/zhouwg/kantv?tab=readme-ov-file&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;KanTV&lt;/a&gt; (Apache-2.0)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/firatkiral/kodibot&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;KodiBot&lt;/a&gt; (GPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.vim&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama.vim&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/abgulati/LARS&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LARS&lt;/a&gt; (AGPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/vietanhdev/llama-assistant&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Llama Assistant&lt;/a&gt; (GPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/guinmoon/LLMFarm?tab=readme-ov-file&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLMFarm&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/undreamai/LLMUnity&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLMUnity&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://lmstudio.ai/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LMStudio&lt;/a&gt; (proprietary)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/mudler/LocalAI&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LocalAI&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/LostRuins/koboldcpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LostRuins/koboldcpp&lt;/a&gt; (AGPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://mindmac.app&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MindMac&lt;/a&gt; (proprietary)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/MindWorkAI/AI-Studio&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;MindWorkAI/AI-Studio&lt;/a&gt; (FSL-1.1-MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Mobile-Artificial-Intelligence/maid&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mobile-Artificial-Intelligence/maid&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Mozilla-Ocho/llamafile&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mozilla-Ocho/llamafile&lt;/a&gt; (Apache-2.0)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nat/openplayground&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;nat/openplayground&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nomic-ai/gpt4all&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;nomic-ai/gpt4all&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ollama/ollama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ollama/ollama&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/oobabooga/text-generation-webui&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;oobabooga/text-generation-webui&lt;/a&gt; (AGPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/a-ghorbani/pocketpal-ai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;PocketPal AI&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/psugihara/FreeChat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;psugihara/FreeChat&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ptsochantaris/emeltal&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ptsochantaris/emeltal&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/pythops/tenere&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;pythops/tenere&lt;/a&gt; (AGPL)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/containers/ramalama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ramalama&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/semperai/amica&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;semperai/amica&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/withcatai/catai&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;withcatai/catai&lt;/a&gt; (MIT)&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/blackhole89/autopen&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Autopen&lt;/a&gt; (GPL)&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Tools&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/akx/ggify&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;akx/ggify&lt;/a&gt; – download PyTorch models from HuggingFace Hub and convert them to GGML&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/akx/ollama-dl&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;akx/ollama-dl&lt;/a&gt; – download models from the Ollama library to be used directly with llama.cpp&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/crashr/gppm&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;crashr/gppm&lt;/a&gt; – launch llama.cpp instances utilizing NVIDIA Tesla P40 or P100 GPUs with reduced idle power consumption&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/gpustack/gguf-parser-go/tree/main/cmd/gguf-parser&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;gpustack/gguf-parser&lt;/a&gt; – review/check the GGUF file and estimate the memory usage&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://marketplace.unity.com/packages/tools/generative-ai/styled-lines-llama-cpp-model-292902&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Styled Lines&lt;/a&gt; (proprietary license; async wrapper of the inference part for game development in Unity3D, with pre-built mobile and web platform wrappers and a model example)&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Infrastructure&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/intentee/paddler&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Paddler&lt;/a&gt; - Open-source LLMOps platform for hosting and scaling AI in your own infrastructure&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/gpustack/gpustack&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GPUStack&lt;/a&gt; - Manage GPU clusters for running LLMs&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/onicai/llama_cpp_canister&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama_cpp_canister&lt;/a&gt; - llama.cpp as a smart contract on the Internet Computer, using WebAssembly&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/mostlygeek/llama-swap&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llama-swap&lt;/a&gt; - transparent proxy that adds automatic model switching with llama-server&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/kalavai-net/kalavai-client&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Kalavai&lt;/a&gt; - Crowdsource end-to-end LLM deployment at any scale&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/InftyAI/llmaz&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;llmaz&lt;/a&gt; - ☸️ Easy, advanced inference platform for large language models on Kubernetes.&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;details&gt;
&lt;summary&gt;Games&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/MorganRO8/Lucys_Labyrinth&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Lucy&amp;rsquo;s Labyrinth&lt;/a&gt; - A simple maze game where agents controlled by an AI model will try to trick you.&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;h2 id=&#34;supported-backends&#34;&gt;Supported backends
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Backend&lt;/th&gt;
          &lt;th&gt;Target devices&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#metal-build&#34; &gt;Metal&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Apple Silicon&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#blas-build&#34; &gt;BLAS&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;All&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/backend/BLIS.md&#34; &gt;BLIS&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;All&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/backend/SYCL.md&#34; &gt;SYCL&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Intel and Nvidia GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#musa&#34; &gt;MUSA&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Moore Threads GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#cuda&#34; &gt;CUDA&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Nvidia GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#hip&#34; &gt;HIP&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;AMD GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#vulkan&#34; &gt;Vulkan&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#cann&#34; &gt;CANN&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Ascend NPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/backend/OPENCL.md&#34; &gt;OpenCL&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;Adreno GPU&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/backend/zDNN.md&#34; &gt;IBM zDNN&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;IBM Z &amp;amp; LinuxONE&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md#webgpu&#34; &gt;WebGPU [In Progress]&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;All&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RPC&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;All&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;obtaining-and-quantizing-models&#34;&gt;Obtaining and quantizing models
&lt;/h2&gt;&lt;p&gt;The &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face&lt;/a&gt; platform hosts a &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?library=gguf&amp;amp;sort=trending&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;number of LLMs&lt;/a&gt; compatible with &lt;code&gt;llama.cpp&lt;/code&gt;:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?library=gguf&amp;amp;sort=trending&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Trending&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?sort=trending&amp;amp;search=llama&amp;#43;gguf&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;You can either manually download the GGUF file or directly use any &lt;code&gt;llama.cpp&lt;/code&gt;-compatible models from &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face&lt;/a&gt; or other model hosting sites, such as &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModelScope&lt;/a&gt;, by using this CLI argument: &lt;code&gt;-hf &amp;lt;user&amp;gt;/&amp;lt;model&amp;gt;[:quant]&lt;/code&gt;. For example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-sh&#34; data-lang=&#34;sh&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;By default, the CLI downloads from Hugging Face; you can switch to another source with the environment variable &lt;code&gt;MODEL_ENDPOINT&lt;/code&gt;. For example, you may opt to download model checkpoints from ModelScope or other model-sharing communities by setting the environment variable, e.g. &lt;code&gt;MODEL_ENDPOINT=https://www.modelscope.cn/&lt;/code&gt;.&lt;/p&gt;
&lt;p&gt;After downloading a model, use the CLI tools to run it locally - see below.&lt;/p&gt;
&lt;p&gt;&lt;code&gt;llama.cpp&lt;/code&gt; requires the model to be stored in the &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/ggml/blob/master/docs/gguf.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF&lt;/a&gt; file format. Models in other data formats can be converted to GGUF using the &lt;code&gt;convert_*.py&lt;/code&gt; Python scripts in this repo.&lt;/p&gt;
&lt;p&gt;The Hugging Face platform provides a variety of online tools for converting, quantizing and hosting models with &lt;code&gt;llama.cpp&lt;/code&gt;:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Use the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/ggml-org/gguf-my-repo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF-my-repo space&lt;/a&gt; to convert to GGUF format and quantize model weights to smaller sizes&lt;/li&gt;
&lt;li&gt;Use the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/ggml-org/gguf-my-lora&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF-my-LoRA space&lt;/a&gt; to convert LoRA adapters to GGUF format (more info: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/10123&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.cpp/discussions/10123&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;Use the &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/spaces/CISCai/gguf-editor&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGUF-editor space&lt;/a&gt; to edit GGUF metadata in the browser (more info: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/9268&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.cpp/discussions/9268&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;Use the &lt;a class=&#34;link&#34; href=&#34;https://ui.endpoints.huggingface.co/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Inference Endpoints&lt;/a&gt; to directly host &lt;code&gt;llama.cpp&lt;/code&gt; in the cloud (more info: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/9669&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/ggml-org/llama.cpp/discussions/9669&lt;/a&gt;)&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;To learn more about model quantization, &lt;a class=&#34;link&#34; href=&#34;tools/quantize/README.md&#34; &gt;read this documentation&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;llama-cli&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;tools/main&#34; &gt;&lt;code&gt;llama-cli&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;a-cli-tool-for-accessing-and-experimenting-with-most-of-llamacpps-functionality&#34;&gt;A CLI tool for accessing and experimenting with most of &lt;code&gt;llama.cpp&lt;/code&gt;&amp;rsquo;s functionality.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details open&gt;
  &lt;summary&gt;Run in conversation mode&lt;/summary&gt;
&lt;p&gt;Models with a built-in chat template will automatically activate conversation mode. If this doesn&amp;rsquo;t occur, you can manually enable it by adding &lt;code&gt;-cnv&lt;/code&gt; and specifying a suitable chat template with &lt;code&gt;--chat-template NAME&lt;/code&gt;.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m model.gguf
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# &amp;gt; hi, who are you?&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Hi there! I&amp;#39;m your helpful assistant! I&amp;#39;m an AI-powered chatbot designed to assist and provide information to users like you. I&amp;#39;m here to help answer your questions, provide guidance, and offer support on a wide range of topics. I&amp;#39;m a friendly and knowledgeable AI, and I&amp;#39;m always happy to help with anything you need. What&amp;#39;s on your mind, and how can I assist you today?&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;#&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# &amp;gt; what is 1+1?&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Easy peasy! The answer to 1+1 is... 2!&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Run in conversation mode with custom chat template&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use the &amp;#34;chatml&amp;#34; template (use -h to see the list of supported templates)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m model.gguf -cnv --chat-template chatml
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use a custom template&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m model.gguf -cnv --in-prefix &lt;span class=&#34;s1&#34;&gt;&amp;#39;User: &amp;#39;&lt;/span&gt; --reverse-prompt &lt;span class=&#34;s1&#34;&gt;&amp;#39;User:&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Run simple text completion&lt;/summary&gt;
&lt;p&gt;To disable conversation mode explicitly, use &lt;code&gt;-no-cnv&lt;/code&gt;.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m model.gguf -p &lt;span class=&#34;s2&#34;&gt;&amp;#34;I believe the meaning of life is&amp;#34;&lt;/span&gt; -n &lt;span class=&#34;m&#34;&gt;128&lt;/span&gt; -no-cnv
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# I believe the meaning of life is to find your own truth and to live in accordance with it. For me, this means being true to myself and following my passions, even if they don&amp;#39;t align with societal expectations. I think that&amp;#39;s what I love about yoga – it&amp;#39;s not just a physical practice, but a spiritual one too. It&amp;#39;s about connecting with yourself, listening to your inner voice, and honoring your own unique journey.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Constrain the output with a custom grammar&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-cli -m model.gguf -n &lt;span class=&#34;m&#34;&gt;256&lt;/span&gt; --grammar-file grammars/json.gbnf -p &lt;span class=&#34;s1&#34;&gt;&amp;#39;Request: schedule a call at 8pm; Command:&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# {&amp;#34;appointmentTime&amp;#34;: &amp;#34;8pm&amp;#34;, &amp;#34;appointmentDetails&amp;#34;: &amp;#34;schedule a call&amp;#34;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;The &lt;a class=&#34;link&#34; href=&#34;grammars/&#34; &gt;grammars/&lt;/a&gt; folder contains a handful of sample grammars. To write your own, check out the &lt;a class=&#34;link&#34; href=&#34;grammars/README.md&#34; &gt;GBNF Guide&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;For authoring more complex JSON grammars, check out &lt;a class=&#34;link&#34; href=&#34;https://grammar.intrinsiclabs.ai/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://grammar.intrinsiclabs.ai/&lt;/a&gt;&lt;/p&gt;
  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;llama-server&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;tools/server&#34; &gt;&lt;code&gt;llama-server&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;a-lightweight-openai-api-compatible-http-server-for-serving-llms&#34;&gt;A lightweight, &lt;a class=&#34;link&#34; href=&#34;https://github.com/openai/openai-openapi&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenAI API&lt;/a&gt; compatible, HTTP server for serving LLMs.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details open&gt;
  &lt;summary&gt;Start a local HTTP server with default configuration on port 8080&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf --port &lt;span class=&#34;m&#34;&gt;8080&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Basic web UI can be accessed via browser: http://localhost:8080&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Chat completion endpoint: http://localhost:8080/v1/chat/completions&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Support multiple users and parallel decoding&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# up to 4 concurrent requests, each with 4096 max context&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf -c &lt;span class=&#34;m&#34;&gt;16384&lt;/span&gt; -np &lt;span class=&#34;m&#34;&gt;4&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Enable speculative decoding&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# the draft.gguf model should be a small variant of the target model.gguf&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf -md draft.gguf
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Serve an embedding model&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use the /embedding endpoint&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf --embedding --pooling cls -ub &lt;span class=&#34;m&#34;&gt;8192&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Serve a reranking model&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use the /reranking endpoint&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf --reranking
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Constrain all outputs with a grammar&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# custom grammar&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf --grammar-file grammar.gbnf
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# JSON&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-server -m model.gguf --grammar-file grammars/json.gbnf
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;llama-perplexity&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;tools/perplexity&#34; &gt;&lt;code&gt;llama-perplexity&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;a-tool-for-measuring-the-perplexity--and-other-quality-metrics-of-a-model-over-a-given-text&#34;&gt;A tool for measuring the &lt;a class=&#34;link&#34; href=&#34;tools/perplexity/README.md&#34; &gt;perplexity&lt;/a&gt; &lt;sup id=&#34;fnref:1&#34;&gt;&lt;a href=&#34;#fn:1&#34; class=&#34;footnote-ref&#34; role=&#34;doc-noteref&#34;&gt;1&lt;/a&gt;&lt;/sup&gt; (and other quality metrics) of a model over a given text.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details open&gt;
  &lt;summary&gt;Measure the perplexity over a text file&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-perplexity -m model.gguf -f file.txt
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# [1]15.2701,[2]5.4007,[3]5.3073,[4]6.2965,[5]5.8940,[6]5.6096,[7]5.7942,[8]4.9297, ...&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Final estimate: PPL = 5.4007 +/- 0.67339&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Measure KL divergence&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# TODO&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;llama-bench&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;tools/llama-bench&#34; &gt;&lt;code&gt;llama-bench&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;benchmark-the-performance-of-the-inference-for-various-parameters&#34;&gt;Benchmark the performance of the inference for various parameters.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details open&gt;
  &lt;summary&gt;Run default benchmark&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;9
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-bench -m model.gguf
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Output:&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# | model               |       size |     params | backend    | threads |          test |                  t/s |&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# | ------------------- | ---------: | ---------: | ---------- | ------: | ------------: | -------------------: |&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         pp512 |      5765.41 ± 20.55 |&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# | qwen2 1.5B Q4_0     | 885.97 MiB |     1.54 B | Metal,BLAS |      16 |         tg128 |        197.71 ± 0.81 |&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;#&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# build: 3e0ba0e60 (4229)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;llama-run&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;tools/run&#34; &gt;&lt;code&gt;llama-run&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;a-comprehensive-example-for-running-llamacpp-models-useful-for-inferencing-used-with-ramalama-&#34;&gt;A comprehensive example for running &lt;code&gt;llama.cpp&lt;/code&gt; models. Useful for inferencing. Used with RamaLama &lt;sup id=&#34;fnref:2&#34;&gt;&lt;a href=&#34;#fn:2&#34; class=&#34;footnote-ref&#34; role=&#34;doc-noteref&#34;&gt;2&lt;/a&gt;&lt;/sup&gt;.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Run a model with a specific prompt (by default it&#39;s pulled from the Ollama registry)&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-run granite-code
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;llama-simple&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;examples/simple&#34; &gt;&lt;code&gt;llama-simple&lt;/code&gt;&lt;/a&gt;
&lt;/h2&gt;&lt;h4 id=&#34;a-minimal-example-for-implementing-apps-with-llamacpp-useful-for-developers&#34;&gt;A minimal example for implementing apps with &lt;code&gt;llama.cpp&lt;/code&gt;. Useful for developers.
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;
&lt;details&gt;
  &lt;summary&gt;Basic text completion&lt;/summary&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;llama-simple -m model.gguf
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# Hello my name is Kaitlyn and I am a 16 year old girl. I am a junior in high school and I am currently taking a class called &amp;#34;The Art of&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;  &lt;/details&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;Contributors can open PRs&lt;/li&gt;
&lt;li&gt;Collaborators will be invited based on contributions&lt;/li&gt;
&lt;li&gt;Maintainers can push to branches in the &lt;code&gt;llama.cpp&lt;/code&gt; repo and merge PRs into the &lt;code&gt;master&lt;/code&gt; branch&lt;/li&gt;
&lt;li&gt;Any help with managing issues, PRs and projects is very much appreciated!&lt;/li&gt;
&lt;li&gt;See &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/issues?q=is%3Aissue&amp;#43;is%3Aopen&amp;#43;label%3A%22good&amp;#43;first&amp;#43;issue%22&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;good first issues&lt;/a&gt; for tasks suitable for first contributions&lt;/li&gt;
&lt;li&gt;Read the &lt;a class=&#34;link&#34; href=&#34;CONTRIBUTING.md&#34; &gt;CONTRIBUTING.md&lt;/a&gt; for more information&lt;/li&gt;
&lt;li&gt;Make sure to read this: &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/discussions/205&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Inference at the edge&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;A bit of backstory for those who are interested: &lt;a class=&#34;link&#34; href=&#34;https://changelog.com/podcast/532&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Changelog podcast&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;other-documentation&#34;&gt;Other documentation
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;tools/main/README.md&#34; &gt;main (cli)&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;tools/server/README.md&#34; &gt;server&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;grammars/README.md&#34; &gt;GBNF grammars&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h4 id=&#34;development-documentation&#34;&gt;Development documentation
&lt;/h4&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;docs/build.md&#34; &gt;How to build&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;docs/docker.md&#34; &gt;Running on Docker&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;docs/android.md&#34; &gt;Build on Android&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;docs/development/token_generation_performance_tips.md&#34; &gt;Performance troubleshooting&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/ggml-org/llama.cpp/wiki/GGML-Tips-&amp;amp;-Tricks&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;GGML tips &amp;amp; tricks&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h4 id=&#34;seminal-papers-and-background-on-the-models&#34;&gt;Seminal papers and background on the models
&lt;/h4&gt;&lt;p&gt;If your issue is with model generation quality, then please at least scan the following links and papers to understand the limitations of LLaMA models. This is especially important when choosing an appropriate model size and appreciating both the significant and subtle differences between LLaMA models and ChatGPT:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;LLaMA:
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://ai.facebook.com/blog/large-language-model-llama-meta-ai/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Introducing LLaMA: A foundational, 65-billion-parameter large language model&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2302.13971&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;LLaMA: Open and Efficient Foundation Language Models&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;GPT-3:
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2005.14165&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Language Models are Few-Shot Learners&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;GPT-3.5 / InstructGPT / ChatGPT:
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://openai.com/research/instruction-following&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Aligning language models to follow instructions&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://arxiv.org/abs/2203.02155&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Training language models to follow instructions with human feedback&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;xcframework&#34;&gt;XCFramework
&lt;/h2&gt;&lt;p&gt;The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
and macOS. It can be used in Swift projects without the need to compile the
library from source. For example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-swift&#34; data-lang=&#34;swift&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;// swift-tools-version: 5.10&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;// The swift-tools-version declares the minimum version of Swift required to build this package.&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kd&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nc&#34;&gt;PackageDescription&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kd&#34;&gt;let&lt;/span&gt; &lt;span class=&#34;nv&#34;&gt;package&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Package&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;name&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;MyLlamaPackage&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;targets&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;p&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;executableTarget&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;name&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;MyLlamaPackage&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;dependencies&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;                &lt;span class=&#34;s&#34;&gt;&amp;#34;LlamaFramework&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;p&#34;&gt;]),&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;p&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;binaryTarget&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;name&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;LlamaFramework&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;url&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;https://github.com/ggml-org/llama.cpp/releases/download/b5046/llama-b5046-xcframework.zip&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;            &lt;span class=&#34;n&#34;&gt;checksum&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;s&#34;&gt;&amp;#34;c19be78b5f00d8d29a25da41042cb7afa094cbf6280a225abe614b03b20029ab&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;The above example is using an intermediate build &lt;code&gt;b5046&lt;/code&gt; of the library. This can be modified
to use a different version by changing the URL and checksum.&lt;/p&gt;
&lt;h2 id=&#34;completions&#34;&gt;Completions
&lt;/h2&gt;&lt;p&gt;Command-line completion is available for some environments.&lt;/p&gt;
&lt;h4 id=&#34;bash-completion&#34;&gt;Bash Completion
&lt;/h4&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;$ build/bin/llama-cli --completion-bash &amp;gt; ~/.llama-completion.bash
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;$ &lt;span class=&#34;nb&#34;&gt;source&lt;/span&gt; ~/.llama-completion.bash
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Optionally this can be added to your &lt;code&gt;.bashrc&lt;/code&gt; or &lt;code&gt;.bash_profile&lt;/code&gt; to load it
automatically. For example:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-console&#34; data-lang=&#34;console&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;gp&#34;&gt;$&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;echo&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;source ~/.llama-completion.bash&amp;#34;&lt;/span&gt; &amp;gt;&amp;gt; ~/.bashrc
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;dependencies&#34;&gt;Dependencies
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/yhirose/cpp-httplib&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;yhirose/cpp-httplib&lt;/a&gt; - Single-header HTTP server, used by &lt;code&gt;llama-server&lt;/code&gt; - MIT License&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nothings/stb&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;stb-image&lt;/a&gt; - Single-header image format decoder, used by multimodal subsystem - Public domain&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nlohmann/json&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;nlohmann/json&lt;/a&gt; - Single-header JSON library, used by various tools/examples - MIT License&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/google/minja&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;minja&lt;/a&gt; - Minimal Jinja parser in C++, used by various tools/examples - MIT License&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;./tools/run/linenoise.cpp/linenoise.cpp&#34; &gt;linenoise.cpp&lt;/a&gt; - C++ library that provides readline-like line editing capabilities, used by &lt;code&gt;llama-run&lt;/code&gt; - BSD 2-Clause License&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://curl.se/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;curl&lt;/a&gt; - Client-side URL transfer library, used by various tools/examples - &lt;a class=&#34;link&#34; href=&#34;https://curl.se/docs/copyright.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;CURL License&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/mackron/miniaudio&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;miniaudio.h&lt;/a&gt; - Single-header audio format decoder, used by multimodal subsystem - Public domain&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;footnotes&#34; role=&#34;doc-endnotes&#34;&gt;
&lt;hr&gt;
&lt;ol&gt;
&lt;li id=&#34;fn:1&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/docs/transformers/perplexity&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://huggingface.co/docs/transformers/perplexity&lt;/a&gt;&amp;#160;&lt;a href=&#34;#fnref:1&#34; class=&#34;footnote-backref&#34; role=&#34;doc-backlink&#34;&gt;&amp;#x21a9;&amp;#xfe0e;&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;li id=&#34;fn:2&#34;&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/containers/ramalama&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;RamaLama&lt;/a&gt;&amp;#160;&lt;a href=&#34;#fnref:2&#34; class=&#34;footnote-backref&#34; role=&#34;doc-backlink&#34;&gt;&amp;#x21a9;&amp;#xfe0e;&lt;/a&gt;&lt;/p&gt;
&lt;/li&gt;
&lt;/ol&gt;
&lt;/div&gt;
</description>
        </item>
        <item>
        <title>gpt4all</title>
        <link>https://producthunt.programnotes.cn/en/p/gpt4all/</link>
        <pubDate>Thu, 14 Aug 2025 15:31:32 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/gpt4all/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1512221747435-73c38dd7afa1?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NTUxNTY2MDV8&amp;ixlib=rb-4.1.0" alt="Featured image of post gpt4all" /&gt;&lt;h1 id=&#34;nomic-aigpt4all&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nomic-ai/gpt4all&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;nomic-ai/gpt4all&lt;/a&gt;
&lt;/h1&gt;&lt;h1 align=&#34;center&#34;&gt;GPT4All&lt;/h1&gt;
&lt;p align=&#34;center&#34;&gt;
  Now with support for DeepSeek R1 Distillations
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://www.nomic.ai/gpt4all&#34;&gt;Website&lt;/a&gt; &amp;bull; &lt;a href=&#34;https://docs.gpt4all.io&#34;&gt;Documentation&lt;/a&gt; &amp;bull; &lt;a href=&#34;https://discord.gg/mGZE39AS3e&#34;&gt;Discord&lt;/a&gt; &amp;bull; &lt;a href=&#34;https://www.youtube.com/watch?v=gQcZDXRVJok&#34;&gt;YouTube Tutorial&lt;/a&gt;
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  GPT4All runs large language models (LLMs) privately on everyday desktops &amp; laptops.
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  No API calls or GPUs required - you can just download the application and &lt;a href=&#34;https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart&#34;&gt;get started&lt;/a&gt;.
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  Read about what&#39;s new in &lt;a href=&#34;https://www.nomic.ai/blog/tag/gpt4all&#34;&gt;our blog&lt;/a&gt;.
&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
  &lt;a href=&#34;https://nomic.ai/gpt4all/#newsletter-form&#34;&gt;Subscribe to the newsletter&lt;/a&gt;
&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311&lt;/a&gt;&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
GPT4All is made possible by our compute partner &lt;a href=&#34;https://www.paperspace.com/&#34;&gt;Paperspace&lt;/a&gt;.
&lt;/p&gt;
&lt;h2 id=&#34;download-links&#34;&gt;Download Links
&lt;/h2&gt;&lt;p&gt;
  &amp;mdash; &lt;a href=&#34;https://gpt4all.io/installers/gpt4all-installer-win64.exe&#34;&gt;
    &lt;img src=&#34;https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/python/docs/assets/windows.png&#34; style=&#34;height: 1em; width: auto&#34; /&gt; Windows Installer
  &lt;/a&gt; &amp;mdash;
&lt;/p&gt;
&lt;p&gt;
  &amp;mdash; &lt;a href=&#34;https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe&#34;&gt;
    &lt;img src=&#34;https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/python/docs/assets/windows.png&#34; style=&#34;height: 1em; width: auto&#34; /&gt; Windows ARM Installer
  &lt;/a&gt; &amp;mdash;
&lt;/p&gt;
&lt;p&gt;
  &amp;mdash; &lt;a href=&#34;https://gpt4all.io/installers/gpt4all-installer-darwin.dmg&#34;&gt;
    &lt;img src=&#34;https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/python/docs/assets/mac.png&#34; style=&#34;height: 1em; width: auto&#34; /&gt; macOS Installer
  &lt;/a&gt; &amp;mdash;
&lt;/p&gt;
&lt;p&gt;
  &amp;mdash; &lt;a href=&#34;https://gpt4all.io/installers/gpt4all-installer-linux.run&#34;&gt;
    &lt;img src=&#34;https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-bindings/python/docs/assets/ubuntu.svg&#34; style=&#34;height: 1em; width: auto&#34; /&gt; Ubuntu Installer
  &lt;/a&gt; &amp;mdash;
&lt;/p&gt;
&lt;p&gt;
  The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
&lt;/p&gt;
&lt;p&gt;
  The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
&lt;/p&gt;
&lt;p&gt;
  The Linux build is x86-64 only (no ARM).
&lt;/p&gt;
&lt;p&gt;
  The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
&lt;/p&gt;
&lt;p&gt;See the full &lt;a class=&#34;link&#34; href=&#34;https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/system_requirements.md&#34; &gt;System Requirements&lt;/a&gt; for more details.&lt;/p&gt;
&lt;br/&gt;
&lt;br/&gt;
&lt;p&gt;
  &lt;a href=&#39;https://flathub.org/apps/io.gpt4all.gpt4all&#39;&gt;
    &lt;img style=&#34;height: 2em; width: auto&#34; alt=&#39;Get it on Flathub&#39; src=&#39;https://flathub.org/api/badge&#39;&gt;&lt;br/&gt;
    Flathub (community maintained)
  &lt;/a&gt;
&lt;/p&gt;
&lt;h2 id=&#34;install-gpt4all-python&#34;&gt;Install GPT4All Python
&lt;/h2&gt;&lt;p&gt;&lt;code&gt;gpt4all&lt;/code&gt; gives you access to LLMs with our Python client around &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggerganov/llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;code&gt;llama.cpp&lt;/code&gt;&lt;/a&gt; implementations.&lt;/p&gt;
&lt;p&gt;Nomic contributes to open source software like &lt;a class=&#34;link&#34; href=&#34;https://github.com/ggerganov/llama.cpp&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;code&gt;llama.cpp&lt;/code&gt;&lt;/a&gt; to make LLMs accessible and efficient &lt;strong&gt;for all&lt;/strong&gt;.&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install gpt4all
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;gpt4all&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;GPT4All&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;GPT4All&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;Meta-Llama-3-8B-Instruct.Q4_0.gguf&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# downloads / loads a 4.66GB LLM&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;with&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chat_session&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;():&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;How can I run LLMs efficiently on my laptop?&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;max_tokens&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1024&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;integrations&#34;&gt;Integrations
&lt;/h2&gt;&lt;p&gt;🦜🔗 &lt;a class=&#34;link&#34; href=&#34;https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Langchain&lt;/a&gt;
🗃️ &lt;a class=&#34;link&#34; href=&#34;https://github.com/weaviate/weaviate&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Weaviate Vector Database&lt;/a&gt; - &lt;a class=&#34;link&#34; href=&#34;https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;module docs&lt;/a&gt;
🔭 &lt;a class=&#34;link&#34; href=&#34;https://github.com/openlit/openlit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;OpenLIT (OTel-native Monitoring)&lt;/a&gt; - &lt;a class=&#34;link&#34; href=&#34;https://docs.openlit.io/latest/integrations/gpt4all&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docs&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;release-history&#34;&gt;Release History
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;strong&gt;July 2nd, 2024&lt;/strong&gt;: V3.0.0 Release
&lt;ul&gt;
&lt;li&gt;Fresh redesign of the chat application UI&lt;/li&gt;
&lt;li&gt;Improved user workflow for LocalDocs&lt;/li&gt;
&lt;li&gt;Expanded access to more model architectures&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;October 19th, 2023&lt;/strong&gt;: GGUF Support Launches with Support for:
&lt;ul&gt;
&lt;li&gt;Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Nomic Vulkan&lt;/a&gt; support for Q4_0 and Q4_1 quantizations in GGUF.&lt;/li&gt;
&lt;li&gt;Offline build support for running old versions of the GPT4All Local LLM Chat Client.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;September 18th, 2023&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Nomic Vulkan&lt;/a&gt; launches supporting local LLM inference on NVIDIA and AMD GPUs.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;July 2023&lt;/strong&gt;: Stable support for LocalDocs, a feature that allows you to privately and locally chat with your data.&lt;/li&gt;
&lt;li&gt;&lt;strong&gt;June 28th, 2023&lt;/strong&gt;: &lt;a class=&#34;link&#34; href=&#34;https://github.com/nomic-ai/gpt4all/tree/cef74c2be20f5b697055d5b8b506861c7b997fab/gpt4all-api&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Docker-based API server&lt;/a&gt; launches allowing inference of local LLMs from an OpenAI-compatible HTTP endpoint.&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;contributing&#34;&gt;Contributing
&lt;/h2&gt;&lt;p&gt;GPT4All welcomes contributions, involvement, and discussion from the open source community!
Please see CONTRIBUTING.md and follow the issues, bug reports, and PR markdown templates.&lt;/p&gt;
&lt;p&gt;Check project discord, with project owners, or through existing issues/PRs to avoid duplicate work.
Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
Example tags: &lt;code&gt;backend&lt;/code&gt;, &lt;code&gt;bindings&lt;/code&gt;, &lt;code&gt;python-bindings&lt;/code&gt;, &lt;code&gt;documentation&lt;/code&gt;, etc.&lt;/p&gt;
&lt;h2 id=&#34;citation&#34;&gt;Citation
&lt;/h2&gt;&lt;p&gt;If you utilize this repository, models or data in a downstream project, please consider citing it with:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-fallback&#34; data-lang=&#34;fallback&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;@misc{gpt4all,
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  author = {Yuvanesh Anand and Zach Nussbaum and Brandon Duderstadt and Benjamin Schmidt and Andriy Mulyar},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  title = {GPT4All: Training an Assistant-style Chatbot with Large Scale Data Distillation from GPT-3.5-Turbo},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  year = {2023},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  publisher = {GitHub},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  journal = {GitHub repository},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  howpublished = {\url{https://github.com/nomic-ai/gpt4all}},
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;}
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        
    </channel>
</rss>
