<?xml version="1.0" encoding="utf-8" standalone="yes"?>
<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
    <channel>
        <title>ASR on Producthunt daily</title>
        <link>https://producthunt.programnotes.cn/en/tags/asr/</link>
        <description>Recent content in ASR on Producthunt daily</description>
        <generator>Hugo -- gohugo.io</generator>
        <language>en</language>
        <lastBuildDate>Sat, 10 May 2025 15:25:32 +0800</lastBuildDate><atom:link href="https://producthunt.programnotes.cn/en/tags/asr/index.xml" rel="self" type="application/rss+xml" /><item>
        <title>NeMo</title>
        <link>https://producthunt.programnotes.cn/en/p/nemo/</link>
        <pubDate>Sat, 10 May 2025 15:25:32 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/nemo/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1729952832073-bf7d3d6150cd?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NDY4NjE4OTR8&amp;ixlib=rb-4.1.0" alt="Featured image of post NeMo" /&gt;&lt;h1 id=&#34;nvidianemo&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA/NeMo&lt;/a&gt;
&lt;/h1&gt;&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;http://www.repostatus.org/#active&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;http://www.repostatus.org/badges/latest/active.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Project Status: Active – The project has reached a stable, usable state and is being actively developed.&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://readthedocs.com/projects/nvidia-nemo/badge/?version=main&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Documentation&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/nvidia/nemo/actions/workflows/codeql.yml&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://github.com/nvidia/nemo/actions/workflows/codeql.yml/badge.svg?branch=main&amp;amp;event=push&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;CodeQL&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/blob/master/LICENSE&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;NeMo core license and license for collections in this repo&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://badge.fury.io/py/nemo-toolkit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://badge.fury.io/py/nemo-toolkit.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Release version&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://badge.fury.io/py/nemo-toolkit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/pypi/pyversions/nemo-toolkit.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Python version&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://pepy.tech/project/nemo-toolkit&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://static.pepy.tech/personalized-badge/nemo-toolkit?period=total&amp;amp;units=international_system&amp;amp;left_color=grey&amp;amp;right_color=brightgreen&amp;amp;left_text=downloads&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;PyPi total downloads&#34;
	
	
&gt;&lt;/a&gt;
&lt;a class=&#34;link&#34; href=&#34;https://github.com/psf/black&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/badge/code%20style-black-000000.svg&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Code style: black&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h1 id=&#34;nvidia-nemo-framework&#34;&gt;&lt;strong&gt;NVIDIA NeMo Framework&lt;/strong&gt;
&lt;/h1&gt;&lt;h2 id=&#34;latest-news&#34;&gt;Latest News
&lt;/h2&gt;&lt;!-- markdownlint-disable --&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;Pretrain and finetune :hugs:Hugging Face models via AutoModel&lt;/b&gt;&lt;/summary&gt;
      NeMo Framework&#39;s latest feature AutoModel enables broad support for :hugs:Hugging Face models, with 25.02 focusing on &lt;a href=https://huggingface.co/transformers/v3.5.1/model_doc/auto.html#automodelforcausallm&gt;AutoModelForCausalLM&lt;/a&gt; in the &lt;a href=https://huggingface.co/models?pipeline_tag=text-generation&amp;sort=trending&gt;text generation category&lt;/a&gt;. Future releases will enable support for more model families, such as vision language models.
&lt;/details&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;Training on Blackwell using NeMo&lt;/b&gt;&lt;/summary&gt;
      NeMo Framework has added Blackwell support, with 25.02 focusing on functional parity for B200. More optimizations to come in the upcoming releases.
&lt;/details&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;NeMo Framework 2.0&lt;/b&gt;&lt;/summary&gt;
      We&#39;ve released NeMo 2.0, an update on the NeMo Framework which prioritizes modularity and ease-of-use. Please refer to the &lt;a href=https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html&gt;NeMo Framework User Guide&lt;/a&gt; to get started.
&lt;/details&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;New Cosmos World Foundation Models Support&lt;/b&gt;&lt;/summary&gt;
    &lt;details&gt; 
      &lt;summary&gt; &lt;a href=&#34;https://developer.nvidia.com/blog/advancing-physical-ai-with-nvidia-cosmos-world-foundation-model-platform&#34;&gt;Advancing Physical AI with NVIDIA Cosmos World Foundation Model Platform &lt;/a&gt; (2025-01-09) 
      &lt;/summary&gt; 
        The end-to-end NVIDIA Cosmos platform accelerates world model development for physical AI systems. Built on CUDA, Cosmos combines state-of-the-art world foundation models, video tokenizers, and AI-accelerated data processing pipelines. Developers can accelerate world model development by fine-tuning Cosmos world foundation models or building new ones from the ground up. These models create realistic synthetic videos of environments and interactions, providing a scalable foundation for training complex systems, from simulating humanoid robots performing advanced actions to developing end-to-end autonomous driving models. 
        &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/accelerate-custom-video-foundation-model-pipelines-with-new-nvidia-nemo-framework-capabilities/&#34;&gt;
          Accelerate Custom Video Foundation Model Pipelines with New NVIDIA NeMo Framework Capabilities
        &lt;/a&gt; (2025-01-07)
      &lt;/summary&gt;
        The NeMo Framework now supports training and customizing the &lt;a href=&#34;https://github.com/NVIDIA/Cosmos&#34;&gt;NVIDIA Cosmos&lt;/a&gt; collection of world foundation models. Cosmos leverages advanced text-to-world generation techniques to create fluid, coherent video content from natural language prompts.
        &lt;br&gt;&lt;br&gt;
        You can also now accelerate your video processing step using the &lt;a href=&#34;https://developer.nvidia.com/nemo-curator-video-processing-early-access&#34;&gt;NeMo Curator&lt;/a&gt; library, which provides optimized video processing and captioning features that can deliver up to 89x faster video processing when compared to an unoptimized CPU pipeline.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
&lt;/details&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;Large Language Models and Multimodal Models&lt;/b&gt;&lt;/summary&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/state-of-the-art-multimodal-generative-ai-model-development-with-nvidia-nemo/&#34;&gt;
          State-of-the-Art Multimodal Generative AI Model Development with NVIDIA NeMo
        &lt;/a&gt; (2024-11-06)
      &lt;/summary&gt;
        NVIDIA recently announced significant enhancements to the NeMo platform, focusing on multimodal generative AI models. The update includes NeMo Curator and the Cosmos tokenizer, which streamline the data curation process and enhance the quality of visual data. These tools are designed to handle large-scale data efficiently, making it easier to develop high-quality AI models for various applications, including robotics and autonomous driving. The Cosmos tokenizers, in particular, efficiently map visual data into compact, semantic tokens, which is crucial for training large-scale generative models. The tokenizer is available now on the &lt;a href=https://github.com/NVIDIA/cosmos-tokenizer&gt;NVIDIA/cosmos-tokenizer&lt;/a&gt; GitHub repo and on &lt;a href=https://huggingface.co/nvidia/Cosmos-Tokenizer-CV8x8x8&gt;Hugging Face&lt;/a&gt;.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/llama/index.html#new-llama-3-1-support&#34;&gt;
        New Llama 3.1 Support
        &lt;/a&gt; (2024-07-23)
      &lt;/summary&gt;
        The NeMo Framework now supports training and customizing the Llama 3.1 collection of LLMs from Meta.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://aws.amazon.com/blogs/machine-learning/accelerate-your-generative-ai-distributed-training-workloads-with-the-nvidia-nemo-framework-on-amazon-eks/&#34;&gt;
          Accelerate your Generative AI Distributed Training Workloads with the NVIDIA NeMo Framework on Amazon EKS
        &lt;/a&gt; (2024-07-16)
      &lt;/summary&gt;
     NVIDIA NeMo Framework now runs distributed training workloads on an Amazon Elastic Kubernetes Service (Amazon EKS) cluster. For step-by-step instructions on creating an EKS cluster and running distributed training workloads with NeMo, see the GitHub repository &lt;a href=&#34;https://github.com/aws-samples/awsome-distributed-training/tree/main/3.test_cases/2.nemo-launcher/EKS/&#34;&gt; here.&lt;/a&gt;
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/nvidia-nemo-accelerates-llm-innovation-with-hybrid-state-space-model-support/&#34;&gt;
          NVIDIA NeMo Accelerates LLM Innovation with Hybrid State Space Model Support
        &lt;/a&gt; (2024/06/17)
      &lt;/summary&gt;
     NVIDIA NeMo and Megatron Core now support pre-training and fine-tuning of state space models (SSMs). NeMo also supports training models based on the Griffin architecture as described by Google DeepMind. 
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
      &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://huggingface.co/models?sort=trending&amp;search=nvidia%2Fnemotron-4-340B&#34;&gt;
          NVIDIA releases 340B base, instruct, and reward models pretrained on a total of 9T tokens.
        &lt;/a&gt; (2024-06-18)
      &lt;/summary&gt;
      See documentation and tutorials for SFT, PEFT, and PTQ with 
      &lt;a href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/llms/nemotron/index.html&#34;&gt;
        Nemotron 340B 
      &lt;/a&gt;
      in the NeMo Framework User Guide.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/nvidia-sets-new-generative-ai-performance-and-scale-records-in-mlperf-training-v4-0/&#34;&gt;
          NVIDIA sets new generative AI performance and scale records in MLPerf Training v4.0
        &lt;/a&gt; (2024/06/12)
      &lt;/summary&gt;
      Using the NVIDIA NeMo Framework and NVIDIA Hopper GPUs, NVIDIA was able to scale to 11,616 H100 GPUs and achieve near-linear performance scaling on LLM pretraining. 
      NVIDIA also achieved the highest LLM fine-tuning performance and raised the bar for text-to-image training.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
        &lt;summary&gt;
          &lt;a href=&#34;https://cloud.google.com/blog/products/compute/gke-and-nvidia-nemo-framework-to-train-generative-ai-models&#34;&gt;
            Accelerate your generative AI journey with NVIDIA NeMo Framework on GKE
          &lt;/a&gt; (2024/03/16)
        &lt;/summary&gt;
        An end-to-end walkthrough to train generative AI models on the Google Kubernetes Engine (GKE) using the NVIDIA NeMo Framework is available at https://github.com/GoogleCloudPlatform/nvidia-nemo-on-gke. 
        The walkthrough includes detailed instructions on how to set up a Google Cloud Project and pre-train a GPT model using the NeMo Framework.
        &lt;br&gt;&lt;br&gt;
      &lt;/details&gt;
&lt;/details&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;Speech Recognition&lt;/b&gt;&lt;/summary&gt;
  &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/accelerating-leaderboard-topping-asr-models-10x-with-nvidia-nemo/&#34;&gt;
          Accelerating Leaderboard-Topping ASR Models 10x with NVIDIA NeMo
        &lt;/a&gt; (2024/09/24)
      &lt;/summary&gt;
      The NVIDIA NeMo team released a number of inference optimizations for CTC, RNN-T, and TDT models that resulted in up to a 10x inference speed-up. 
      These models now exceed an inverse real-time factor (RTFx) of 2,000, with some reaching an RTFx of 6,000.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/new-standard-for-speech-recognition-and-translation-from-the-nvidia-nemo-canary-model/&#34;&gt;
          New Standard for Speech Recognition and Translation from the NVIDIA NeMo Canary Model
        &lt;/a&gt; (2024/04/18)
      &lt;/summary&gt;
      The NeMo team just released Canary, a multilingual model that transcribes speech in English, Spanish, German, and French with punctuation and capitalization. 
      Canary also provides bi-directional translation, between English and the three other supported languages.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/pushing-the-boundaries-of-speech-recognition-with-nemo-parakeet-asr-models/&#34;&gt;
          Pushing the Boundaries of Speech Recognition with NVIDIA NeMo Parakeet ASR Models
        &lt;/a&gt; (2024/04/18)
      &lt;/summary&gt;
      NVIDIA NeMo, an end-to-end platform for the development of multimodal generative AI models at scale anywhere—on any cloud and on-premises—released the Parakeet family of automatic speech recognition (ASR) models. 
      These state-of-the-art ASR models, developed in collaboration with Suno.ai, transcribe spoken English with exceptional accuracy.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
  &lt;details&gt;
    &lt;summary&gt;
      &lt;a href=&#34;https://developer.nvidia.com/blog/turbocharge-asr-accuracy-and-speed-with-nvidia-nemo-parakeet-tdt/&#34;&gt;
        Turbocharge ASR Accuracy and Speed with NVIDIA NeMo Parakeet-TDT
      &lt;/a&gt; (2024/04/18)
    &lt;/summary&gt;
    NVIDIA NeMo, an end-to-end platform for developing multimodal generative AI models at scale anywhere—on any cloud and on-premises—recently released Parakeet-TDT. 
    This new addition to the NeMo ASR Parakeet model family boasts better accuracy and 64% greater speed than the previous best model, Parakeet-RNNT-1.1B.
    &lt;br&gt;&lt;br&gt;
  &lt;/details&gt;
&lt;/details&gt;
&lt;!-- markdownlint-enable --&gt;
&lt;h2 id=&#34;introduction&#34;&gt;Introduction
&lt;/h2&gt;&lt;p&gt;NVIDIA NeMo Framework is a scalable and cloud-native generative AI
framework built for researchers and PyTorch developers working on Large
Language Models (LLMs), Multimodal Models (MMs), Automatic Speech
Recognition (ASR), Text to Speech (TTS), and Computer Vision (CV)
domains. It is designed to help you efficiently create, customize, and
deploy new generative AI models by leveraging existing code and
pre-trained model checkpoints.&lt;/p&gt;
&lt;p&gt;For technical documentation, please see the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Framework User
Guide&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;whats-new-in-nemo-20&#34;&gt;What&amp;rsquo;s New in NeMo 2.0
&lt;/h2&gt;&lt;p&gt;NVIDIA NeMo 2.0 introduces several significant improvements over its predecessor, NeMo 1.0, enhancing flexibility, performance, and scalability.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Python-Based Configuration&lt;/strong&gt; - NeMo 2.0 transitions from YAML files to a Python-based configuration, providing more flexibility and control. This shift makes it easier to extend and customize configurations programmatically.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Modular Abstractions&lt;/strong&gt; - By adopting PyTorch Lightning’s modular abstractions, NeMo 2.0 simplifies adaptation and experimentation. This modular approach allows developers to more easily modify and experiment with different components of their models.&lt;/p&gt;
&lt;/li&gt;
&lt;li&gt;
&lt;p&gt;&lt;strong&gt;Scalability&lt;/strong&gt; - NeMo 2.0 seamlessly scales large-scale experiments across thousands of GPUs using &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo-Run&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo-Run&lt;/a&gt;, a powerful tool designed to streamline the configuration, execution, and management of machine learning experiments across computing environments.&lt;/p&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Overall, these enhancements make NeMo 2.0 a powerful, scalable, and user-friendly framework for AI model development.&lt;/p&gt;
&lt;blockquote&gt;
&lt;p&gt;[!IMPORTANT]&lt;br&gt;
NeMo 2.0 is currently supported by the LLM (large language model) and VLM (vision language model) collections.&lt;/p&gt;
&lt;/blockquote&gt;
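&lt;p&gt;To make the Python-based configuration point above concrete, here is a minimal sketch in the spirit of the NeMo 2.0 Quickstart. The recipe module, its arguments, and the executor shown here follow the Quickstart pattern but should be treated as assumptions for illustration; check the Quickstart and Feature Guide linked below for the exact API.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Illustrative sketch of NeMo 2.0&#39;s Python-based configuration with NeMo-Run.
# The recipe name and arguments below are assumptions based on the Quickstart
# pattern, not a verbatim copy of the current API.
import nemo_run as run
from nemo.collections import llm

# Build a pre-training recipe as a plain Python object instead of a YAML file.
recipe = llm.llama3_8b.pretrain_recipe(
    name=&#39;llama3_pretraining_sketch&#39;,
    num_nodes=1,
    num_gpus_per_node=8,
)

# Because the configuration is Python, it can be inspected and overridden programmatically.
recipe.trainer.max_steps = 100

# Execute locally; swap the executor for a Slurm or cloud executor to scale out.
run.run(recipe, executor=run.LocalExecutor())
&lt;/code&gt;&lt;/pre&gt;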
&lt;h3 id=&#34;get-started-with-nemo-20&#34;&gt;Get Started with NeMo 2.0
&lt;/h3&gt;&lt;ul&gt;
&lt;li&gt;Refer to the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/quickstart.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Quickstart&lt;/a&gt; for examples of using NeMo-Run to launch NeMo 2.0 experiments locally and on a slurm cluster.&lt;/li&gt;
&lt;li&gt;For more information about NeMo 2.0, see the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Framework User Guide&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/recipes&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo 2.0 Recipes&lt;/a&gt; contains additional examples of launching large-scale runs using NeMo 2.0 and NeMo-Run.&lt;/li&gt;
&lt;li&gt;For an in-depth exploration of the main features of NeMo 2.0, see the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/features/index.html#feature-guide&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Feature Guide&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;To transition from NeMo 1.0 to 2.0, see the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/nemo-2.0/migration/index.html#migration-guide&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Migration Guide&lt;/a&gt; for step-by-step instructions.&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;get-started-with-cosmos&#34;&gt;Get Started with Cosmos
&lt;/h3&gt;&lt;p&gt;NeMo Curator and NeMo Framework support video curation and post-training of the Cosmos World Foundation Models, which are open and available on &lt;a class=&#34;link&#34; href=&#34;https://catalog.ngc.nvidia.com/orgs/nvidia/teams/cosmos/collections/cosmos&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NGC&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/collections/nvidia/cosmos-6751e884dc10e013a0a0d8e6&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face&lt;/a&gt;. For more information on video datasets, refer to &lt;a class=&#34;link&#34; href=&#34;https://developer.nvidia.com/nemo-curator&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Curator&lt;/a&gt;. To post-train World Foundation Models using the NeMo Framework for your custom physical AI tasks, see the &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/diffusion/nemo/post_training/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Cosmos Diffusion models&lt;/a&gt; and the &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/Cosmos/blob/main/cosmos1/models/autoregressive/nemo/post_training/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Cosmos Autoregressive models&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;llms-and-mms-training-alignment-and-customization&#34;&gt;LLMs and MMs Training, Alignment, and Customization
&lt;/h2&gt;&lt;p&gt;All NeMo models are trained with
&lt;a class=&#34;link&#34; href=&#34;https://github.com/Lightning-AI/lightning&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Lightning&lt;/a&gt;. Training is
automatically scalable to 1000s of GPUs. You can check the performance benchmarks using the
latest NeMo Framework container &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/performance/performance_summary.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;here&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;When applicable, NeMo models leverage cutting-edge distributed training
techniques, incorporating &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/modeloverview.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;parallelism
strategies&lt;/a&gt;
to enable efficient training of very large models. These techniques
include Tensor Parallelism (TP), Pipeline Parallelism (PP), Fully
Sharded Data Parallelism (FSDP), Mixture-of-Experts (MoE), and Mixed
Precision Training with BFloat16 and FP8, as well as others.&lt;/p&gt;
&lt;p&gt;NeMo Transformer-based LLMs and MMs utilize &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/TransformerEngine&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA Transformer
Engine&lt;/a&gt; for FP8 training on
NVIDIA Hopper GPUs, while leveraging &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA Megatron
Core&lt;/a&gt; for
scaling Transformer model training.&lt;/p&gt;
&lt;p&gt;NeMo LLMs can be aligned with state-of-the-art methods such as SteerLM,
Direct Preference Optimization (DPO), and Reinforcement Learning from
Human Feedback (RLHF). See &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo-Aligner&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA NeMo
Aligner&lt;/a&gt; for more information.&lt;/p&gt;
&lt;p&gt;In addition to supervised fine-tuning (SFT), NeMo also supports the
latest parameter efficient fine-tuning (PEFT) techniques such as LoRA,
P-Tuning, Adapters, and IA3. Refer to the &lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/sft_peft/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Framework User
Guide&lt;/a&gt;
for the full list of supported models and techniques.&lt;/p&gt;
&lt;h2 id=&#34;llms-and-mms-deployment-and-optimization&#34;&gt;LLMs and MMs Deployment and Optimization
&lt;/h2&gt;&lt;p&gt;NeMo LLMs and MMs can be deployed and optimized with &lt;a class=&#34;link&#34; href=&#34;https://developer.nvidia.com/nemo-microservices-early-access&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA NeMo
Microservices&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;speech-ai&#34;&gt;Speech AI
&lt;/h2&gt;&lt;p&gt;NeMo ASR and TTS models can be optimized for inference and deployed for
production use cases with &lt;a class=&#34;link&#34; href=&#34;https://developer.nvidia.com/riva&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA Riva&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;nemo-framework-launcher&#34;&gt;NeMo Framework Launcher
&lt;/h2&gt;&lt;blockquote&gt;
&lt;p&gt;[!IMPORTANT]&lt;br&gt;
NeMo Framework Launcher is compatible with NeMo version 1.0 only. &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo-Run&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo-Run&lt;/a&gt; is recommended for launching experiments using NeMo 2.0.&lt;/p&gt;
&lt;/blockquote&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo-Megatron-Launcher&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Framework
Launcher&lt;/a&gt; is a
cloud-native tool that streamlines the NeMo Framework experience. It is
used for launching end-to-end NeMo Framework training jobs on CSPs and
Slurm clusters.&lt;/p&gt;
&lt;p&gt;The NeMo Framework Launcher includes extensive recipes, scripts,
utilities, and documentation for training NeMo LLMs. It also includes
the NeMo Framework &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo-Megatron-Launcher#53-using-autoconfigurator-to-find-the-optimal-configuration&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Autoconfigurator&lt;/a&gt;,
which is designed to find the optimal model parallel configuration for
training on a specific cluster.&lt;/p&gt;
&lt;p&gt;To get started quickly with the NeMo Framework Launcher, please see the
&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo Framework
Playbooks&lt;/a&gt;.
The NeMo Framework Launcher does not currently support ASR and TTS
training, but it will soon.&lt;/p&gt;
&lt;h2 id=&#34;get-started-with-nemo-framework&#34;&gt;Get Started with NeMo Framework
&lt;/h2&gt;&lt;p&gt;Getting started with NeMo Framework is easy. State-of-the-art pretrained
NeMo models are freely available on &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/models?library=nemo&amp;amp;sort=downloads&amp;amp;search=nvidia&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Hugging Face
Hub&lt;/a&gt;
and &lt;a class=&#34;link&#34; href=&#34;https://catalog.ngc.nvidia.com/models?query=nemo&amp;amp;orderBy=weightPopularDESC&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA
NGC&lt;/a&gt;.
These models can be used to generate text or images, transcribe audio,
and synthesize speech in just a few lines of code.&lt;/p&gt;
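&lt;p&gt;As a minimal sketch of that few-lines-of-code workflow for speech recognition, the following assumes nemo_toolkit with the ASR extras is installed and that a 16 kHz mono WAV file is on disk; the checkpoint name is one example from the catalogs above and can be swapped for any other NeMo ASR model.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Minimal ASR sketch: load a pretrained NeMo checkpoint and transcribe a local file.
# The model name and audio path are examples only; pick any ASR checkpoint from
# the Hugging Face Hub or NGC listings above.
import nemo.collections.asr as nemo_asr

asr_model = nemo_asr.models.ASRModel.from_pretrained(model_name=&#39;stt_en_conformer_ctc_large&#39;)
transcripts = asr_model.transcribe([&#39;sample.wav&#39;])  # list of audio paths in, transcripts out
print(transcripts[0])
&lt;/code&gt;&lt;/pre&gt;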
&lt;p&gt;We have extensive
&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/starthere/tutorials.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;tutorials&lt;/a&gt;
that can be run on &lt;a class=&#34;link&#34; href=&#34;https://colab.research.google.com&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Google Colab&lt;/a&gt; or
with our &lt;a class=&#34;link&#34; href=&#34;https://catalog.ngc.nvidia.com/orgs/nvidia/containers/nemo&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NGC NeMo Framework
Container&lt;/a&gt;.
We also have
&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/nemo-framework/user-guide/latest/playbooks/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;playbooks&lt;/a&gt;
for users who want to train NeMo models with the NeMo Framework
Launcher.&lt;/p&gt;
&lt;p&gt;For advanced users who want to train NeMo models from scratch or
fine-tune existing NeMo models, we have a full suite of &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/tree/main/examples&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;example
scripts&lt;/a&gt; that support
multi-GPU/multi-node training.&lt;/p&gt;
&lt;h2 id=&#34;key-features&#34;&gt;Key Features
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;nemo/collections/nlp/README.md&#34; &gt;Large Language Models&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;nemo/collections/multimodal/README.md&#34; &gt;Multimodal&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;nemo/collections/asr/README.md&#34; &gt;Automatic Speech Recognition&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;nemo/collections/tts/README.md&#34; &gt;Text to Speech&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;nemo/collections/vision/README.md&#34; &gt;Computer Vision&lt;/a&gt;&lt;/li&gt;
&lt;/ul&gt;
&lt;h2 id=&#34;requirements&#34;&gt;Requirements
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;Python 3.10 or above&lt;/li&gt;
&lt;li&gt;PyTorch 2.5 or above&lt;/li&gt;
&lt;li&gt;NVIDIA GPU (if you intend to do model training)&lt;/li&gt;
&lt;/ul&gt;
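&lt;p&gt;As a quick sanity check against these requirements (assuming PyTorch is already installed in your environment), a short snippet like the following prints the Python and PyTorch versions and whether a CUDA-capable GPU is visible:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Quick environment check for the requirements above (illustrative only).
import sys

import torch

print(&#39;Python:&#39;, sys.version.split()[0])                 # expect 3.10 or newer
print(&#39;PyTorch:&#39;, torch.__version__)                     # expect 2.5 or newer
print(&#39;CUDA GPU available:&#39;, torch.cuda.is_available())  # needed for model training
&lt;/code&gt;&lt;/pre&gt;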
&lt;h2 id=&#34;developer-documentation&#34;&gt;Developer Documentation
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;Version&lt;/th&gt;
          &lt;th&gt;Status&lt;/th&gt;
          &lt;th&gt;Description&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;Latest&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://readthedocs.com/projects/nvidia-nemo/badge/?version=main&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Documentation Status&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Documentation of the latest (i.e. main) branch.&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;Stable&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://readthedocs.com/projects/nvidia-nemo/badge/?version=stable&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;Documentation Status&#34;
	
	
&gt;&lt;/a&gt;&lt;/td&gt;
          &lt;td&gt;&lt;a class=&#34;link&#34; href=&#34;https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/stable/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Documentation of the stable (i.e. most recent release) branch.&lt;/a&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;install-nemo-framework&#34;&gt;Install NeMo Framework
&lt;/h2&gt;&lt;p&gt;The NeMo Framework can be installed in a variety of ways, depending on
your needs. Depending on the domain, you may find one of the following
installation methods more suitable.&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#conda--pip&#34; &gt;Conda / Pip&lt;/a&gt;: Install NeMo-Framework with native Pip into a virtual environment.
&lt;ul&gt;
&lt;li&gt;Used to explore NeMo on any supported platform.&lt;/li&gt;
&lt;li&gt;This is the recommended method for ASR and TTS domains.&lt;/li&gt;
&lt;li&gt;Limited feature-completeness for other domains.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#ngc-pytorch-container&#34; &gt;NGC PyTorch container&lt;/a&gt;: Install NeMo-Framework from source with feature-completeness into a highly optimized container.
&lt;ul&gt;
&lt;li&gt;For users that want to install from source in a highly optimized container.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;#ngc-nemo-container&#34; &gt;NGC NeMo container&lt;/a&gt;: Ready-to-go solution of NeMo-Framework
&lt;ul&gt;
&lt;li&gt;For users that seek highest performance.&lt;/li&gt;
&lt;li&gt;Contains all dependencies installed and tested for performance and convergence.&lt;/li&gt;
&lt;/ul&gt;
&lt;/li&gt;
&lt;/ul&gt;
&lt;h3 id=&#34;support-matrix&#34;&gt;Support matrix
&lt;/h3&gt;&lt;p&gt;NeMo-Framework provides tiers of support based on OS / Platform and mode of installation. Please refer to the following overview of support levels:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;Fully supported: Max performance and feature-completeness.&lt;/li&gt;
&lt;li&gt;Limited support: Used to explore NeMo.&lt;/li&gt;
&lt;li&gt;No support yet: In development.&lt;/li&gt;
&lt;li&gt;Deprecated: Support has reached end of life.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;Please refer to the following table for current support levels:&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th&gt;OS / Platform&lt;/th&gt;
          &lt;th&gt;Install from PyPi&lt;/th&gt;
          &lt;th&gt;Source into NGC container&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;linux&lt;/code&gt; - &lt;code&gt;amd64/x86_64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;Limited support&lt;/td&gt;
          &lt;td&gt;Full support&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;linux&lt;/code&gt; - &lt;code&gt;arm64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;Limited support&lt;/td&gt;
          &lt;td&gt;Limited support&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;darwin&lt;/code&gt; - &lt;code&gt;amd64/x86_64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;Deprecated&lt;/td&gt;
          &lt;td&gt;Deprecated&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;darwin&lt;/code&gt; - &lt;code&gt;arm64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;Limited support&lt;/td&gt;
          &lt;td&gt;Limited support&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;windows&lt;/code&gt; - &lt;code&gt;amd64/x86_64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;No support yet&lt;/td&gt;
          &lt;td&gt;No support yet&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td&gt;&lt;code&gt;windows&lt;/code&gt; - &lt;code&gt;arm64&lt;/code&gt;&lt;/td&gt;
          &lt;td&gt;No support yet&lt;/td&gt;
          &lt;td&gt;No support yet&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h3 id=&#34;conda--pip&#34;&gt;Conda / Pip
&lt;/h3&gt;&lt;p&gt;Install NeMo in a fresh Conda environment:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;conda create --name nemo &lt;span class=&#34;nv&#34;&gt;python&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;==&lt;/span&gt;3.10.12
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;conda activate nemo
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h4 id=&#34;pick-the-right-version&#34;&gt;Pick the right version
&lt;/h4&gt;&lt;p&gt;NeMo-Framework publishes pre-built wheels with each release.
To install nemo_toolkit from such a wheel, use the following installation method:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install &lt;span class=&#34;s2&#34;&gt;&amp;#34;nemo_toolkit[all]&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;If a more specific version is desired, we recommend a Pip-VCS install. From &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo&#34; &gt;NVIDIA/NeMo&lt;/a&gt;, fetch the commit, branch, or tag that you would like to install.&lt;br&gt;
To install nemo_toolkit from this Git reference &lt;code&gt;$REF&lt;/code&gt;, use the following installation method:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/NVIDIA/NeMo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; NeMo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git checkout &lt;span class=&#34;si&#34;&gt;${&lt;/span&gt;&lt;span class=&#34;nv&#34;&gt;REF&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;:-&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;main&amp;#39;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install &lt;span class=&#34;s1&#34;&gt;&amp;#39;.[all]&amp;#39;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h4 id=&#34;install-a-specific-domain&#34;&gt;Install a specific Domain
&lt;/h4&gt;&lt;p&gt;To install a specific domain of NeMo, you must first install the
nemo_toolkit using the instructions listed above. Then, you run the
following domain-specific commands:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;all&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;all&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;asr&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;asr&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;nlp&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;nlp&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;tts&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;tts&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;vision&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;vision&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip install nemo_toolkit&lt;span class=&#34;o&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;multimodal&amp;#39;&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# or pip install &amp;#34;nemo_toolkit[&amp;#39;multimodal&amp;#39;]@git+https://github.com/NVIDIA/NeMo@${REF:-&amp;#39;main&amp;#39;}&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;ngc-pytorch-container&#34;&gt;NGC PyTorch container
&lt;/h3&gt;&lt;p&gt;&lt;strong&gt;NOTE: The following steps are supported beginning with 24.04 (NeMo-Toolkit 2.3.0)&lt;/strong&gt;&lt;/p&gt;
&lt;p&gt;We recommend that you start with a base NVIDIA PyTorch container:
nvcr.io/nvidia/pytorch:25.01-py3.&lt;/p&gt;
&lt;p&gt;If starting with a base NVIDIA PyTorch container, you must first launch
the container:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --gpus all &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -it &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --rm &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --shm-size&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;16g &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --ulimit &lt;span class=&#34;nv&#34;&gt;memlock&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;-1 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --ulimit &lt;span class=&#34;nv&#34;&gt;stack&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;67108864&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  nvcr.io/nvidia/pytorch:&lt;span class=&#34;si&#34;&gt;${&lt;/span&gt;&lt;span class=&#34;nv&#34;&gt;NV_PYTORCH_TAG&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;:-&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;25.01-py3&amp;#39;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;From &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo&#34; &gt;NVIDIA/NeMo&lt;/a&gt;, fetch the commit/branch/tag that you want to install.&lt;br&gt;
To install nemo_toolkit including all of its dependencies from this Git reference &lt;code&gt;$REF&lt;/code&gt;, use the following installation method:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; /opt
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/NVIDIA/NeMo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; NeMo
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git checkout &lt;span class=&#34;si&#34;&gt;${&lt;/span&gt;&lt;span class=&#34;nv&#34;&gt;REF&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;:-&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;main&amp;#39;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;bash reinstall.sh --library all
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;ngc-nemo-container&#34;&gt;NGC NeMo container
&lt;/h2&gt;&lt;p&gt;NeMo containers are launched concurrently with NeMo version updates.
NeMo Framework now supports LLMs, MMs, ASR, and TTS in a single
consolidated Docker container. You can find additional information about
released containers on the &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/releases&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo releases
page&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;To use a pre-built container, run the following code:&lt;/p&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bash&#34; data-lang=&#34;bash&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;docker run &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --gpus all &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  -it &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --rm &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --shm-size&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;16g &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --ulimit &lt;span class=&#34;nv&#34;&gt;memlock&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;-1 &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  --ulimit &lt;span class=&#34;nv&#34;&gt;stack&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;67108864&lt;/span&gt; &lt;span class=&#34;se&#34;&gt;\
&lt;/span&gt;&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  nvcr.io/nvidia/nemo:&lt;span class=&#34;si&#34;&gt;${&lt;/span&gt;&lt;span class=&#34;nv&#34;&gt;NV_NEMO_TAG&lt;/span&gt;&lt;span class=&#34;k&#34;&gt;:-&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;25.02&amp;#39;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;future-work&#34;&gt;Future Work
&lt;/h2&gt;&lt;p&gt;The NeMo Framework Launcher does not currently support ASR and TTS
training, but it will soon.&lt;/p&gt;
&lt;h2 id=&#34;discussions-board&#34;&gt;Discussions Board
&lt;/h2&gt;&lt;p&gt;FAQ can be found on the NeMo &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/discussions&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Discussions
board&lt;/a&gt;. You are welcome to
ask questions or start discussions on the board.&lt;/p&gt;
&lt;h2 id=&#34;contribute-to-nemo&#34;&gt;Contribute to NeMo
&lt;/h2&gt;&lt;p&gt;We welcome community contributions! Please refer to
&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/blob/stable/CONTRIBUTING.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;CONTRIBUTING.md&lt;/a&gt;
for the process.&lt;/p&gt;
&lt;h2 id=&#34;publications&#34;&gt;Publications
&lt;/h2&gt;&lt;p&gt;We provide an ever-growing list of
&lt;a class=&#34;link&#34; href=&#34;https://nvidia.github.io/NeMo/publications/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;publications&lt;/a&gt; that utilize
the NeMo Framework.&lt;/p&gt;
&lt;p&gt;To contribute an article to the collection, please submit a pull request
to the &lt;code&gt;gh-pages-src&lt;/code&gt; branch of this repository. For detailed
information, please consult the README located at the &lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo/tree/gh-pages-src#readme&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;gh-pages-src
branch&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;blogs&#34;&gt;Blogs
&lt;/h2&gt;&lt;!-- markdownlint-disable --&gt;
&lt;details open&gt;
  &lt;summary&gt;&lt;b&gt;Large Language Models and Multimodal Models&lt;/b&gt;&lt;/summary&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://blogs.nvidia.com/blog/bria-builds-responsible-generative-ai-using-nemo-picasso/&#34;&gt;
          Bria Builds Responsible Generative AI for Enterprises Using NVIDIA NeMo, Picasso
        &lt;/a&gt; (2024/03/06)
      &lt;/summary&gt;
      Bria, a Tel Aviv startup at the forefront of visual generative AI for enterprises now leverages the NVIDIA NeMo Framework. 
      The Bria.ai platform uses reference implementations from the NeMo Multimodal collection, trained on NVIDIA Tensor Core GPUs, to enable high-throughput and low-latency image generation. 
      Bria has also adopted NVIDIA Picasso, a foundry for visual generative AI models, to run inference.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility/&#34;&gt;
          New NVIDIA NeMo Framework Features and NVIDIA H200
        &lt;/a&gt; (2023/12/06)
      &lt;/summary&gt;
      NVIDIA NeMo Framework now includes several optimizations and enhancements, 
      including: 
      1) Fully Sharded Data Parallelism (FSDP) to improve the efficiency of training large-scale AI models, 
      2) Mix of Experts (MoE)-based LLM architectures with expert parallelism for efficient LLM training at scale, 
      3) Reinforcement Learning from Human Feedback (RLHF) with TensorRT-LLM for inference stage acceleration, and 
      4) up to 4.2x speedups for Llama 2 pre-training on NVIDIA H200 Tensor Core GPUs.
      &lt;br&gt;&lt;br&gt;
      &lt;a href=&#34;https://developer.nvidia.com/blog/new-nvidia-nemo-framework-features-and-nvidia-h200-supercharge-llm-training-performance-and-versatility&#34;&gt;
      &lt;img src=&#34;https://github.com/sbhavani/TransformerEngine/blob/main/docs/examples/H200-NeMo-performance.png&#34; alt=&#34;H200-NeMo-performance&#34; style=&#34;width: 600px;&#34;&gt;&lt;/a&gt;
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
    &lt;details&gt;
      &lt;summary&gt;
        &lt;a href=&#34;https://blogs.nvidia.com/blog/nemo-amazon-titan/&#34;&gt;
          NVIDIA now powers training for Amazon Titan Foundation models
        &lt;/a&gt; (2023/11/28)
      &lt;/summary&gt;
      NVIDIA NeMo Framework now empowers the Amazon Titan foundation models (FM) with efficient training of large language models (LLMs). 
      The Titan FMs form the basis of Amazon’s generative AI service, Amazon Bedrock. 
      The NeMo Framework provides a versatile framework for building, customizing, and running LLMs.
      &lt;br&gt;&lt;br&gt;
    &lt;/details&gt;
&lt;/details&gt;
&lt;!-- markdownlint-enable --&gt;
&lt;h2 id=&#34;licenses&#34;&gt;Licenses
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/NVIDIA/NeMo?tab=Apache-2.0-1-ov-file#readme&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NeMo GitHub Apache 2.0
license&lt;/a&gt;&lt;/li&gt;
&lt;li&gt;NeMo is licensed under the &lt;a class=&#34;link&#34; href=&#34;https://www.nvidia.com/en-us/data-center/products/nvidia-ai-enterprise/eula/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;NVIDIA AI PRODUCT
AGREEMENT&lt;/a&gt;.
By pulling and using the container, you accept the terms and
conditions of this license.&lt;/li&gt;
&lt;/ul&gt;
</description>
        </item>
        <item>
        <title>FunASR</title>
        <link>https://producthunt.programnotes.cn/en/p/funasr/</link>
        <pubDate>Wed, 09 Apr 2025 15:29:01 +0800</pubDate>
        
        <guid>https://producthunt.programnotes.cn/en/p/funasr/</guid>
        <description>&lt;img src="https://images.unsplash.com/photo-1695807216937-fddfaa1f63ac?ixid=M3w0NjAwMjJ8MHwxfHJhbmRvbXx8fHx8fHx8fDE3NDQxODM2NTd8&amp;ixlib=rb-4.0.3" alt="Featured image of post FunASR" /&gt;&lt;h1 id=&#34;modelscopefunasr&#34;&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/modelscope/FunASR&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;modelscope/FunASR&lt;/a&gt;
&lt;/h1&gt;&lt;p&gt;(&lt;a class=&#34;link&#34; href=&#34;./README_zh.md&#34; &gt;简体中文&lt;/a&gt;|English)&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://github.com/Akshay090/svg-banners&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://svg-banners.vercel.app/api?type=origin&amp;amp;text1=FunASR%f0%9f%a4%a0&amp;amp;text2=%f0%9f%92%96%20A%20Fundamental%20End-to-End%20Speech%20Recognition%20Toolkit&amp;amp;width=800&amp;amp;height=210&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;SVG Banners&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;https://pypi.org/project/funasr/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;img src=&#34;https://img.shields.io/pypi/v/funasr&#34;
	
	
	
	loading=&#34;lazy&#34;
	
		alt=&#34;PyPI&#34;
	
	
&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p align=&#34;center&#34;&gt;
&lt;a href=&#34;https://trendshift.io/repositories/3839&#34; target=&#34;_blank&#34;&gt;&lt;img src=&#34;https://trendshift.io/api/badge/repositories/3839&#34; alt=&#34;alibaba-damo-academy%2FFunASR | Trendshift&#34; style=&#34;width: 250px; height: 55px;&#34; width=&#34;250&#34; height=&#34;55&#34;/&gt;&lt;/a&gt;
&lt;/p&gt;
&lt;p&gt;&lt;strong&gt;FunASR&lt;/strong&gt; aims to build a bridge between academic research and industrial applications of speech recognition. By supporting the training and fine-tuning of industrial-grade speech recognition models, it lets researchers and developers conduct speech recognition research and production more conveniently and promotes the growth of the speech recognition ecosystem. ASR for Fun!&lt;/p&gt;
&lt;p&gt;&lt;a class=&#34;link&#34; href=&#34;#highlights&#34; &gt;&lt;strong&gt;Highlights&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR#whats-new&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;News&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;#installation&#34; &gt;&lt;strong&gt;Installation&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;#quick-start&#34; &gt;&lt;strong&gt;Quick Start&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/tutorial/README.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;&lt;strong&gt;Tutorial&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;./runtime/readme.md&#34; &gt;&lt;strong&gt;Runtime&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;#model-zoo&#34; &gt;&lt;strong&gt;Model Zoo&lt;/strong&gt;&lt;/a&gt;
| &lt;a class=&#34;link&#34; href=&#34;#contact&#34; &gt;&lt;strong&gt;Contact&lt;/strong&gt;&lt;/a&gt;&lt;/p&gt;
&lt;p&gt;&lt;a name=&#34;highlights&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;highlights&#34;&gt;Highlights
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker Diarization and multi-talker ASR. FunASR provides convenient scripts and tutorials, supporting inference and fine-tuning of pre-trained models.&lt;/li&gt;
&lt;li&gt;We have released a vast collection of academic and industrial pretrained models on the &lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models?page=1&amp;amp;tasks=auto-speech-recognition&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;ModelScope&lt;/a&gt; and &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/FunASR&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;huggingface&lt;/a&gt;, which can be accessed through our &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Model Zoo&lt;/a&gt;. The representative &lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Paraformer-large&lt;/a&gt;, a non-autoregressive end-to-end speech recognition model, has the advantages of high accuracy, high efficiency, and convenient deployment, supporting the rapid construction of speech recognition services. For more details on service deployment, please refer to the &lt;a class=&#34;link&#34; href=&#34;runtime/readme_cn.md&#34; &gt;service deployment document&lt;/a&gt;.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;&lt;a name=&#34;whats-new&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;whats-new&#34;&gt;What&amp;rsquo;s new:
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;2024/10/29: Real-time Transcription Service 1.12 released; the 2pass-offline mode now supports the SenseVoiceSmall model (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;);&lt;/li&gt;
&lt;li&gt;2024/10/10: Added support for the Whisper-large-v3-turbo model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from &lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/whisper/demo.py&#34; &gt;modelscope&lt;/a&gt; or &lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/whisper/demo_from_openai.py&#34; &gt;openai&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2024/09/26: Offline File Transcription Service 4.6, Offline File Transcription Service of English 1.7, and Real-time Transcription Service 1.11 released, fixing a memory leak and adding support for the SenseVoiceSmall ONNX model; File Transcription Service 2.0 GPU released, fixing a GPU memory leak (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;);&lt;/li&gt;
&lt;li&gt;2024/09/25: Keyword spotting models are newly supported, with fine-tuning and inference for four models: &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fsmn_kws&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;fsmn_kws_mt&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-offline&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;sanm_kws&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_sanm_kws_phone-xiaoyun-commands-online&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;sanm_kws_streaming&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2024/07/04: &lt;a class=&#34;link&#34; href=&#34;https://github.com/FunAudioLLM/SenseVoice&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SenseVoice&lt;/a&gt; is a speech foundation model with multiple speech understanding capabilities, including ASR, LID, SER, and AED.&lt;/li&gt;
&lt;li&gt;2024/07/01: Offline File Transcription Service GPU 1.1 released, optimizing BladeDISC model compatibility; refer to (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/06/27: Offline File Transcription Service GPU 1.0 released, supporting dynamic batching and multi-threaded concurrency. On the long-audio test set, the single-thread RTF is 0.0076, and the multi-threaded speedup is 1200+ (compared to 330+ on CPU); refer to (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/05/15: Emotion recognition models are newly supported: &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/emotion2vec_plus_large/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;emotion2vec+large&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/emotion2vec_plus_base/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;emotion2vec+base&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/emotion2vec_plus_seed/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;emotion2vec+seed&lt;/a&gt;. The following categories are currently supported: 0: angry, 1: happy, 2: neutral, 3: sad, 4: unknown.&lt;/li&gt;
&lt;li&gt;2024/05/15: Offline File Transcription Service 4.5, Offline File Transcription Service of English 1.6, and Real-time Transcription Service 1.10 released, adapting to the FunASR 1.0 model structure (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;/ul&gt;
&lt;details&gt;&lt;summary&gt;Full Changelog&lt;/summary&gt;
&lt;ul&gt;
&lt;li&gt;2024/03/05: Added the Qwen-Audio and Qwen-Audio-Chat large-scale audio-text multimodal models, which have topped multiple audio-domain leaderboards. These models support speech dialogue; see &lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/qwen_audio&#34; &gt;usage&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2024/03/05: Added support for the Whisper-large-v3 model, a multitasking model that can perform multilingual speech recognition, speech translation, and language identification. It can be downloaded from &lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/whisper/demo.py&#34; &gt;modelscope&lt;/a&gt; or &lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/whisper/demo_from_openai.py&#34; &gt;openai&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2024/03/05: Offline File Transcription Service 4.4, Offline File Transcription Service of English 1.5, and Real-time Transcription Service 1.9 released; the Docker image now supports the ARM64 platform, and modelscope has been updated (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/01/30: funasr-1.0 has been released (&lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR/discussions/1319&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/01/30: Emotion recognition models are newly supported. &lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/iic/emotion2vec_base_finetuned/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;model link&lt;/a&gt;, modified from &lt;a class=&#34;link&#34; href=&#34;https://github.com/ddlBoJack/emotion2vec&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;repo&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2024/01/25: Offline File Transcription Service 4.2 and Offline File Transcription Service of English 1.3 released, optimizing the VAD (Voice Activity Detection) data processing, significantly reducing peak memory usage, and addressing memory leaks; Real-time Transcription Service 1.7 released, optimizing the client side (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/01/09: The FunASR SDK for Windows version 2.0 has been released, featuring support for the offline file transcription service (CPU) of Mandarin 4.1, the offline file transcription service (CPU) of English 1.2, and the real-time transcription service (CPU) of Mandarin 1.6. For more details, please refer to the official documentation or release notes (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FunASR-Runtime-Windows&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/01/03: File Transcription Service 4.0 released, adding support for 8k models, addressing timestamp mismatch issues, adding sentence-level timestamps, improving the effectiveness of English word FST hotwords, supporting automated configuration of thread parameters, and fixing known crashes and memory leaks; refer to (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md#file-transcription-service-mandarin-cpu&#34; &gt;docs&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2024/01/03: Real-time Transcription Service 1.6 released; the 2pass-offline mode supports Ngram language model decoding and WFST hotwords, while also addressing known crashes and memory leaks (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md#the-real-time-transcription-service-mandarin-cpu&#34; &gt;docs&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2024/01/03: Fixed known crash and memory leak issues (&lt;a class=&#34;link&#34; href=&#34;runtime/readme.md#file-transcription-service-english-cpu&#34; &gt;docs&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2023/12/04: The FunASR SDK for Windows version 1.0 has been released, featuring support for the offline file transcription service (CPU) of Mandarin, the offline file transcription service (CPU) of English, and the real-time transcription service (CPU) of Mandarin. For more details, please refer to the official documentation or release notes (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/damo/funasr-runtime-win-cpu-x64/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FunASR-Runtime-Windows&lt;/a&gt;)&lt;/li&gt;
&lt;li&gt;2023/11/08: The offline file transcription service 3.0 (CPU) of Mandarin has been released, adding a large punctuation model, an Ngram language model, and WFST hotwords. For detailed information, please refer to the &lt;a class=&#34;link&#34; href=&#34;runtime#file-transcription-service-mandarin-cpu&#34; &gt;docs&lt;/a&gt;.&lt;/li&gt;
&lt;li&gt;2023/10/17: The offline file transcription service (CPU) of English has been released. For more details, please refer to (&lt;a class=&#34;link&#34; href=&#34;runtime#file-transcription-service-english-cpu&#34; &gt;docs&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2023/10/13: &lt;a class=&#34;link&#34; href=&#34;https://slidespeech.github.io/&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;SlideSpeech&lt;/a&gt;: A large-scale multi-modal audio-visual corpus with a significant amount of real-time synchronized slides.&lt;/li&gt;
&lt;li&gt;2023/10/10: The ASR-SpeakersDiarization combined pipeline &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr_vad_spk/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/demo.py&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Paraformer-VAD-SPK&lt;/a&gt; is now released. Try the model to obtain recognition results with speaker information.&lt;/li&gt;
&lt;li&gt;2023/10/07: &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunCodec&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;FunCodec&lt;/a&gt;: A Fundamental, Reproducible and Integrable Open-source Toolkit for Neural Speech Codec.&lt;/li&gt;
&lt;li&gt;2023/09/01: The offline file transcription service 2.0 (CPU) of Mandarin has been released, with added support for ffmpeg, timestamp, and hotword models. For more details, please refer to (&lt;a class=&#34;link&#34; href=&#34;runtime#file-transcription-service-mandarin-cpu&#34; &gt;docs&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2023/08/07: The real-time transcription service (CPU) of Mandarin has been released. For more details, please refer to (&lt;a class=&#34;link&#34; href=&#34;runtime#the-real-time-transcription-service-mandarin-cpu&#34; &gt;docs&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2023/07/17: BAT, a low-latency, low-memory-consumption RNN-T model, has been released. For more details, please refer to (&lt;a class=&#34;link&#34; href=&#34;egs/aishell/bat&#34; &gt;BAT&lt;/a&gt;).&lt;/li&gt;
&lt;li&gt;2023/06/26: The ASRU2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0 has concluded and the results have been announced. For more details, please refer to (&lt;a class=&#34;link&#34; href=&#34;https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;M2MeT2.0&lt;/a&gt;).&lt;/li&gt;
&lt;/ul&gt;
&lt;/details&gt;
&lt;p&gt;&lt;a name=&#34;Installation&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;installation&#34;&gt;Installation
&lt;/h2&gt;&lt;ul&gt;
&lt;li&gt;Requirements&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-text&#34; data-lang=&#34;text&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;python&amp;gt;=3.8
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;torch&amp;gt;=1.13
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;torchaudio
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;Install from PyPI&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip3 install -U funasr
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;Or install from source code&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-sh&#34; data-lang=&#34;sh&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;git clone https://github.com/alibaba/FunASR.git &lt;span class=&#34;o&#34;&gt;&amp;amp;&amp;amp;&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;cd&lt;/span&gt; FunASR
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip3 install -e ./
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;ul&gt;
&lt;li&gt;Install modelscope or huggingface_hub for the pretrained models (Optional)&lt;/li&gt;
&lt;/ul&gt;
&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;pip3 install -U modelscope huggingface_hub
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h2 id=&#34;model-zoo&#34;&gt;Model Zoo
&lt;/h2&gt;&lt;p&gt;FunASR has open-sourced a large number of pre-trained models on industrial data. You are free to use, copy, modify, and share FunASR models under the &lt;a class=&#34;link&#34; href=&#34;./MODEL_LICENSE&#34; &gt;Model License Agreement&lt;/a&gt;. Below are some representative models, for more models please refer to the &lt;a class=&#34;link&#34; href=&#34;./model_zoo&#34; &gt;Model Zoo&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;(Note: ⭐ represents the ModelScope model zoo, 🤗 represents the Huggingface model zoo, 🍀 represents the OpenAI model zoo)&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Model Name&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Task Details&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Training Data&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;Parameters&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;SenseVoiceSmall &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/iic/SenseVoiceSmall&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/FunAudioLLM/SenseVoiceSmall&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;multiple speech understanding capabilities, including ASR, ITN, LID, SER, and AED; supports languages such as zh, yue, en, ja, and ko&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;300000 hours&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;234M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;paraformer-zh &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/paraformer-zh&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, with timestamps, non-streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;60000 hours, Mandarin&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;220M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;nobr&gt;paraformer-zh-streaming &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/paraformer-zh-streaming&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/nobr&gt;&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;60000 hours, Mandarin&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;220M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;paraformer-en &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/paraformer-en&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, without timestamps, non-streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;50000 hours, English&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;220M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;conformer-en &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/conformer-en&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, non-streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;50000 hours, English&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;220M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;ct-punc &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/damo/punc_ct-transformer_cn-en-common-vocab471067-large/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/ct-punc&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;punctuation restoration&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;100M, Mandarin and English&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;290M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;fsmn-vad &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/fsmn-vad&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;voice activity detection&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;5000 hours, Mandarin and English&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;0.4M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;fsmn-kws &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_charctc_kws_phone-xiaoyun/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;keyword spotting, streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;5000 hours, Mandarin&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;0.7M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;fa-zh &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/fa-zh&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;timestamp prediction&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;5000 hours, Mandarin&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;38M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;cam++ &lt;br&gt; ( &lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/speech_campplus_sv_zh-cn_16k-common/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt; &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/funasr/campplus&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speaker verification/diarization&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;5000 hours&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;7.2M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;Whisper-large-v3 &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/iic/Whisper-large-v3/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://github.com/openai/whisper&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🍀&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, with timestamps, non-streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;multilingual&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;1550M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;Whisper-large-v3-turbo &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;https://www.modelscope.cn/models/iic/Whisper-large-v3-turbo/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://github.com/openai/whisper&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🍀&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech recognition, with timestamps, non-streaming&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;multilingual&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;809M&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;Qwen-Audio &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/qwen_audio/demo.py&#34; &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen-Audio&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;audio-text multimodal models (pretraining)&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;multilingual&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;8B&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;Qwen-Audio-Chat &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;examples/industrial_data_pretraining/qwen_audio/demo_chat.py&#34; &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/Qwen/Qwen-Audio-Chat&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;audio-text multimodal models (chat)&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;multilingual&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;8B&lt;/td&gt;
      &lt;/tr&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;emotion2vec+large &lt;br&gt; (&lt;a class=&#34;link&#34; href=&#34;https://modelscope.cn/models/iic/emotion2vec_plus_large/summary&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;⭐&lt;/a&gt;  &lt;a class=&#34;link&#34; href=&#34;https://huggingface.co/emotion2vec/emotion2vec_plus_large&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;🤗&lt;/a&gt; )&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;speech emotion recognition&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;40000 hours&lt;/td&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;300M&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;&lt;a name=&#34;quick-start&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;quick-start&#34;&gt;Quick Start
&lt;/h2&gt;&lt;p&gt;Below is a quick start tutorial. Test audio files are provided (&lt;a class=&#34;link&#34; href=&#34;https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;Mandarin&lt;/a&gt;, &lt;a class=&#34;link&#34; href=&#34;https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;English&lt;/a&gt;).&lt;/p&gt;
&lt;h3 id=&#34;command-line-usage&#34;&gt;Command-line usage
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;funasr ++model&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;paraformer-zh ++vad_model&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fsmn-vad&amp;#34;&lt;/span&gt; ++punc_model&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;ct-punc&amp;#34;&lt;/span&gt; ++input&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;asr_example_zh.wav
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Note: Supports recognition of a single audio file, as well as a file list in Kaldi-style wav.scp format: &lt;code&gt;wav_id wav_path&lt;/code&gt;&lt;/p&gt;
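&lt;p&gt;For reference, a minimal wav.scp file in this format might look like the following (the utterance IDs and paths are purely illustrative):&lt;/p&gt;
&lt;pre&gt;&lt;code&gt;meeting_001 /data/audio/meeting_001.wav
meeting_002 /data/audio/meeting_002.wav
&lt;/code&gt;&lt;/pre&gt;
&lt;p&gt;Each line pairs an utterance ID with the path to its audio file; the list can then be passed in place of a single file, e.g. via &lt;code&gt;++input=wav.scp&lt;/code&gt;.&lt;/p&gt;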
&lt;h3 id=&#34;speech-recognition-non-streaming&#34;&gt;Speech Recognition (Non-streaming)
&lt;/h3&gt;&lt;h4 id=&#34;sensevoice&#34;&gt;SenseVoice
&lt;/h4&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr.utils.postprocess_utils&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;rich_transcription_postprocess&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model_dir&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;iic/SenseVoiceSmall&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_dir&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;vad_model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fsmn-vad&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;vad_kwargs&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;max_single_segment_time&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;30000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;device&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;cuda:0&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# en&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/en.mp3&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{},&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;language&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;auto&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;# &amp;#34;zh&amp;#34;, &amp;#34;en&amp;#34;, &amp;#34;yue&amp;#34;, &amp;#34;ja&amp;#34;, &amp;#34;ko&amp;#34;, &amp;#34;nospeech&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;use_itn&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;batch_size_s&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;60&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;merge_vad&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;c1&#34;&gt;#&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;merge_length_s&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;15&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;text&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;rich_transcription_postprocess&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;][&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;text&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;])&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;text&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Parameter Description:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;model_dir&lt;/code&gt;: The name of the model, or the path to the model on the local disk.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;vad_model&lt;/code&gt;: This enables VAD (Voice Activity Detection), which splits long audio into shorter clips. With VAD enabled, the reported inference time covers both VAD and SenseVoice, i.e. the end-to-end latency. If you wish to measure the SenseVoice model&amp;rsquo;s inference time separately, the VAD model can be disabled, as sketched after this list.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;vad_kwargs&lt;/code&gt;: Specifies the configurations for the VAD model. &lt;code&gt;max_single_segment_time&lt;/code&gt;: denotes the maximum duration for audio segmentation by the &lt;code&gt;vad_model&lt;/code&gt;, with the unit being milliseconds (ms).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;use_itn&lt;/code&gt;: Whether the output result includes punctuation and inverse text normalization.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;batch_size_s&lt;/code&gt;: Indicates the use of dynamic batching, where the total duration of audio in the batch is measured in seconds (s).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;merge_vad&lt;/code&gt;: Whether to merge short audio fragments segmented by the VAD model, with the merged length being &lt;code&gt;merge_length_s&lt;/code&gt;, in seconds (s).&lt;/li&gt;
&lt;li&gt;&lt;code&gt;ban_emo_unk&lt;/code&gt;: Whether to ban the output of the &lt;code&gt;emo_unk&lt;/code&gt; token.&lt;/li&gt;
&lt;/ul&gt;
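&lt;p&gt;As a minimal sketch of the point above about measuring SenseVoice alone (reusing the model ID and bundled example audio from the snippet above), the VAD model can simply be omitted:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

# Sketch: no vad_model is passed, so the measured time is SenseVoice inference
# only, rather than the end-to-end (VAD + SenseVoice) latency shown above.
model = AutoModel(model='iic/SenseVoiceSmall', device='cuda:0')
res = model.generate(
    input=f'{model.model_path}/example/en.mp3',
    language='auto',  # or 'zh', 'en', 'yue', 'ja', 'ko', 'nospeech'
    use_itn=True,
)
print(rich_transcription_postprocess(res[0]['text']))
&lt;/code&gt;&lt;/pre&gt;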
&lt;h4 id=&#34;paraformer&#34;&gt;Paraformer
&lt;/h4&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# paraformer-zh is a multi-functional asr model&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# use vad, punc, spk or not as you need&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;paraformer-zh&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;n&#34;&gt;vad_model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fsmn-vad&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;  &lt;span class=&#34;n&#34;&gt;punc_model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;ct-punc&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;                  &lt;span class=&#34;c1&#34;&gt;# spk_model=&amp;#34;cam++&amp;#34;, &lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;                  &lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/asr_example.wav&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;                     &lt;span class=&#34;n&#34;&gt;batch_size_s&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;300&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; 
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;                     &lt;span class=&#34;n&#34;&gt;hotword&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;魔搭&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Note: &lt;code&gt;hub&lt;/code&gt; selects the model repository to download from: &lt;code&gt;ms&lt;/code&gt; downloads from ModelScope, &lt;code&gt;hf&lt;/code&gt; downloads from Huggingface.&lt;/p&gt;
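&lt;p&gt;For instance, assuming &lt;code&gt;hub&lt;/code&gt; is passed as a keyword argument to &lt;code&gt;AutoModel&lt;/code&gt; (a sketch based on the note above; see the tutorial for the exact API), downloading the same Paraformer pipeline from Huggingface instead of ModelScope might look like:&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;from funasr import AutoModel

# Sketch: hub='ms' selects ModelScope, hub='hf' selects Huggingface; the model
# names stay the same, only the download source changes.
model = AutoModel(model='paraformer-zh', vad_model='fsmn-vad', punc_model='ct-punc', hub='hf')
res = model.generate(input=f'{model.model_path}/example/asr_example.wav', batch_size_s=300)
print(res)
&lt;/code&gt;&lt;/pre&gt;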
&lt;h3 id=&#34;speech-recognition-streaming&#34;&gt;Speech Recognition (Streaming)
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;10&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;5&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;#[0, 10, 5] 600ms, [0, 8, 4] 480ms&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;encoder_chunk_look_back&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;4&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;#number of chunks to lookback for encoder self-attention&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;decoder_chunk_look_back&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;#number of encoder chunks to lookback for decoder cross-attention&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;paraformer-zh-streaming&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;soundfile&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;os&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;os&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;path&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;join&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;example/asr_example.wav&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sample_rate&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;soundfile&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;read&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;960&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# 600ms&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;int&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;((&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;speech_chunk&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;==&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;-&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;speech_chunk&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;encoder_chunk_look_back&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;encoder_chunk_look_back&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;decoder_chunk_look_back&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;decoder_chunk_look_back&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Note: &lt;code&gt;chunk_size&lt;/code&gt; configures the streaming latency. &lt;code&gt;[0,10,5]&lt;/code&gt; means that the real-time display granularity is &lt;code&gt;10*60=600ms&lt;/code&gt; and the lookahead is &lt;code&gt;5*60=300ms&lt;/code&gt;. Each inference input is &lt;code&gt;600ms&lt;/code&gt; of audio (&lt;code&gt;16000*0.6=9600&lt;/code&gt; sample points), and the output is the corresponding text. For the last speech chunk, &lt;code&gt;is_final=True&lt;/code&gt; must be set to force the output of the final word.&lt;/p&gt;
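&lt;p&gt;As a quick sanity check, here is a minimal sketch of the arithmetic behind this configuration; the 16 kHz sample rate and 60 ms encoder frame shift are assumptions taken from the note above, not values read from the model.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Sketch: how chunk_size maps to latency, assuming 16 kHz audio and a
# 60 ms encoder frame shift as described in the note above.
sample_rate = 16000              # Hz
frame_shift_ms = 60              # ms per encoder frame (assumed)
chunk_size = [0, 10, 5]          # [1] = frames per chunk, [2] = lookahead frames

chunk_ms = chunk_size[1] * frame_shift_ms        # 10 * 60 = 600 ms per chunk
lookahead_ms = chunk_size[2] * frame_shift_ms    # 5 * 60 = 300 ms lookahead
chunk_samples = sample_rate * chunk_ms // 1000   # 16000 * 0.6 = 9600 samples

print(chunk_ms, lookahead_ms, chunk_samples)     # 600 300 9600
&lt;/code&gt;&lt;/pre&gt;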
&lt;details&gt;&lt;summary&gt;More Examples&lt;/summary&gt;
&lt;h3 id=&#34;voice-activity-detection-non-streaming&#34;&gt;Voice Activity Detection (Non-Streaming)
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fsmn-vad&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/vad_example.wav&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Note: The output format of the VAD model is: &lt;code&gt;[[beg1, end1], [beg2, end2], ..., [begN, endN]]&lt;/code&gt;, where &lt;code&gt;begN/endN&lt;/code&gt; indicates the starting/ending point of the &lt;code&gt;N-th&lt;/code&gt; valid audio segment, measured in milliseconds.&lt;/p&gt;
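&lt;p&gt;As a hedged illustration of consuming this output, the sketch below converts the millisecond segments into sample indices and slices the waveform. It assumes the segment list is stored under &lt;code&gt;res[0][&#34;value&#34;]&lt;/code&gt;; check &lt;code&gt;print(res)&lt;/code&gt; to confirm the actual structure.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Sketch: cut the input wav into the voiced segments returned by the VAD.
# Assumption: the [[beg, end], ...] list (milliseconds) sits under res[0][&#34;value&#34;].
import soundfile

speech, sample_rate = soundfile.read(wav_file)
segments = res[0][&#34;value&#34;]

clips = []
for beg_ms, end_ms in segments:
    beg = int(beg_ms * sample_rate / 1000)   # ms to sample index
    end = int(end_ms * sample_rate / 1000)
    clips.append(speech[beg:end])

print(f&#34;kept {len(clips)} voiced segments&#34;)
&lt;/code&gt;&lt;/pre&gt;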
&lt;h3 id=&#34;voice-activity-detection-streaming&#34;&gt;Voice Activity Detection (Streaming)
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;200&lt;/span&gt; &lt;span class=&#34;c1&#34;&gt;# ms&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fsmn-vad&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;soundfile&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/vad_example.wav&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sample_rate&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;soundfile&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;read&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;int&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;*&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;sample_rate&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;/&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;1000&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;{}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;int&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;((&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;-&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;/&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;k&#34;&gt;for&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;ow&#34;&gt;in&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;range&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;speech_chunk&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;speech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;:(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;i&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;+&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;*&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_stride&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;i&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;==&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;total_chunk_num&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;-&lt;/span&gt; &lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;speech_chunk&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;cache&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;is_final&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;chunk_size&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;    &lt;span class=&#34;k&#34;&gt;if&lt;/span&gt; &lt;span class=&#34;nb&#34;&gt;len&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;0&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;][&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;value&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]):&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;        &lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;Note: The output format for the streaming VAD model can be one of four scenarios:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;&lt;code&gt;[[beg1, end1], [beg2, end2], ..., [begN, endN]]&lt;/code&gt;: the same as the offline VAD output result mentioned above.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;[[beg, -1]]&lt;/code&gt;: indicates that only a starting point has been detected.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;[[-1, end]]&lt;/code&gt;: indicates that only an ending point has been detected.&lt;/li&gt;
&lt;li&gt;&lt;code&gt;[]&lt;/code&gt;: indicates that neither a starting point nor an ending point has been detected.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;All timestamps are in milliseconds, measured from the beginning of the audio stream.&lt;/p&gt;
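&lt;p&gt;A minimal sketch of stitching these partial results into complete segments is shown below; it assumes each chunk result exposes its segment list under &lt;code&gt;res[0][&#34;value&#34;]&lt;/code&gt;, matching the check in the loop above.&lt;/p&gt;
&lt;pre&gt;&lt;code class=&#34;language-python&#34;&gt;# Sketch: accumulate streaming VAD outputs into complete [beg, end] segments.
# All times are absolute milliseconds from the start of the audio stream.
segments = []   # finished [beg, end] pairs
pending = []    # a start time still waiting for its matching end

def collect(value):
    for beg, end in value:
        if beg != -1 and end != -1:     # complete segment within one chunk
            segments.append([beg, end])
        elif end == -1:                 # only a start point detected so far
            pending.append(beg)
        elif beg == -1 and pending:     # the matching end point arrived
            segments.append([pending.pop(), end])

# Inside the streaming loop, call collect(res[0][&#34;value&#34;]) for every chunk
# and read the accumulated segments once the audio is exhausted.
&lt;/code&gt;&lt;/pre&gt;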
&lt;h3 id=&#34;punctuation-restoration&#34;&gt;Punctuation Restoration
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;ct-punc&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;那今天的会就到这里吧 happy new year 明年见&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;timestamp-prediction&#34;&gt;Timestamp Prediction
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;fa-zh&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/asr_example.wav&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;text_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/text.txt&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;input&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;text_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;),&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;data_type&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;sound&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;text&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;))&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;speech-emotion-recognition&#34;&gt;Speech Emotion Recognition
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;emotion2vec_plus_large&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;sa&#34;&gt;f&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_path&lt;/span&gt;&lt;span class=&#34;si&#34;&gt;}&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;/example/test.wav&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;generate&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_file&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;output_dir&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;./outputs&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;granularity&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;utterance&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;extract_embedding&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;For more usage details, see the &lt;a class=&#34;link&#34; href=&#34;docs/tutorial/README_zh.md&#34; &gt;docs&lt;/a&gt;;
for more examples, see the &lt;a class=&#34;link&#34; href=&#34;https://github.com/alibaba-damo-academy/FunASR/tree/main/examples/industrial_data_pretraining&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;demo&lt;/a&gt;.&lt;/p&gt;
&lt;/details&gt;
&lt;h2 id=&#34;export-onnx&#34;&gt;Export ONNX
&lt;/h2&gt;&lt;h3 id=&#34;command-line-usage-1&#34;&gt;Command-line usage
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-shell&#34; data-lang=&#34;shell&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;funasr-export ++model&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;paraformer ++quantize&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;nb&#34;&gt;false&lt;/span&gt; ++device&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;cpu
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;python&#34;&gt;Python
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;AutoModel&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;paraformer&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;device&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s2&#34;&gt;&amp;#34;cpu&amp;#34;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;res&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;.&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;export&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;quantize&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;False&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;h3 id=&#34;test-onnx&#34;&gt;Test ONNX
&lt;/h3&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt;1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;9
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-python&#34; data-lang=&#34;python&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;c1&#34;&gt;# pip3 install -U funasr-onnx&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;kn&#34;&gt;from&lt;/span&gt; &lt;span class=&#34;nn&#34;&gt;funasr_onnx&lt;/span&gt; &lt;span class=&#34;kn&#34;&gt;import&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Paraformer&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model_dir&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;s2&#34;&gt;&amp;#34;damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch&amp;#34;&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;model&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;Paraformer&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;model_dir&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;batch_size&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;mi&#34;&gt;1&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;quantize&lt;/span&gt;&lt;span class=&#34;o&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;kc&#34;&gt;True&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;wav_path&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;p&#34;&gt;[&lt;/span&gt;&lt;span class=&#34;s1&#34;&gt;&amp;#39;~/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav&amp;#39;&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;]&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;n&#34;&gt;result&lt;/span&gt; &lt;span class=&#34;o&#34;&gt;=&lt;/span&gt; &lt;span class=&#34;n&#34;&gt;model&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;wav_path&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nb&#34;&gt;print&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;(&lt;/span&gt;&lt;span class=&#34;n&#34;&gt;result&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;)&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;&lt;p&gt;More examples can be found in the &lt;a class=&#34;link&#34; href=&#34;runtime/python/onnxruntime&#34; &gt;demo&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;deployment-service&#34;&gt;Deployment Service
&lt;/h2&gt;&lt;p&gt;FunASR supports deploying pre-trained or further fine-tuned models as a service. Currently, the following types of service deployment are supported:&lt;/p&gt;
&lt;ul&gt;
&lt;li&gt;File transcription service, Mandarin, CPU version (done)&lt;/li&gt;
&lt;li&gt;Real-time transcription service, Mandarin, CPU version (done)&lt;/li&gt;
&lt;li&gt;File transcription service, English, CPU version (done)&lt;/li&gt;
&lt;li&gt;File transcription service, Mandarin, GPU version (in progress)&lt;/li&gt;
&lt;li&gt;and more.&lt;/li&gt;
&lt;/ul&gt;
&lt;p&gt;For more detailed information, please refer to the &lt;a class=&#34;link&#34; href=&#34;runtime/readme.md&#34; &gt;service deployment documentation&lt;/a&gt;.&lt;/p&gt;
&lt;p&gt;&lt;a name=&#34;contact&#34;&gt;&lt;/a&gt;&lt;/p&gt;
&lt;h2 id=&#34;community-communication&#34;&gt;Community Communication
&lt;/h2&gt;&lt;p&gt;If you encounter problems in use, you can open an issue directly on the GitHub page.&lt;/p&gt;
&lt;p&gt;You can also scan the DingTalk group QR code below to join the community for communication and discussion.&lt;/p&gt;
&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;DingTalk group&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
      &lt;tr&gt;
          &lt;td style=&#34;text-align: center&#34;&gt;&lt;div align=&#34;left&#34;&gt;&lt;img src=&#34;docs/images/dingding.png&#34; width=&#34;250&#34;/&gt;&lt;/td&gt;
      &lt;/tr&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;h2 id=&#34;contributors&#34;&gt;Contributors
&lt;/h2&gt;&lt;table&gt;
  &lt;thead&gt;
      &lt;tr&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;div align=&#34;left&#34;&gt;&lt;img src=&#34;docs/images/alibaba.png&#34; width=&#34;260&#34;/&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;div align=&#34;left&#34;&gt;&lt;img src=&#34;docs/images/nwpu.png&#34; width=&#34;260&#34;/&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;docs/images/China_Telecom.png&#34; width=&#34;200&#34;/&gt; &lt;/div&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;docs/images/RapidAI.png&#34; width=&#34;200&#34;/&gt; &lt;/div&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;docs/images/aihealthx.png&#34; width=&#34;200&#34;/&gt; &lt;/div&gt;&lt;/th&gt;
          &lt;th style=&#34;text-align: center&#34;&gt;&lt;img src=&#34;docs/images/XVERSE.png&#34; width=&#34;250&#34;/&gt; &lt;/div&gt;&lt;/th&gt;
      &lt;/tr&gt;
  &lt;/thead&gt;
  &lt;tbody&gt;
  &lt;/tbody&gt;
&lt;/table&gt;
&lt;p&gt;The full list of contributors can be found in the &lt;a class=&#34;link&#34; href=&#34;./Acknowledge.md&#34; &gt;contributors list&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;license&#34;&gt;License
&lt;/h2&gt;&lt;p&gt;This project is licensed under &lt;a class=&#34;link&#34; href=&#34;https://opensource.org/licenses/MIT&#34;  target=&#34;_blank&#34; rel=&#34;noopener&#34;
    &gt;The MIT License&lt;/a&gt;. FunASR also contains various third-party components and some code modified from other repos under other open source licenses.
The use of pretrained models is subject to the &lt;a class=&#34;link&#34; href=&#34;./MODEL_LICENSE&#34; &gt;model license&lt;/a&gt;.&lt;/p&gt;
&lt;h2 id=&#34;citations&#34;&gt;Citations
&lt;/h2&gt;&lt;div class=&#34;highlight&#34;&gt;&lt;div class=&#34;chroma&#34;&gt;
&lt;table class=&#34;lntable&#34;&gt;&lt;tr&gt;&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code&gt;&lt;span class=&#34;lnt&#34;&gt; 1
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 2
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 3
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 4
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 5
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 6
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 7
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 8
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt; 9
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;10
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;11
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;12
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;13
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;14
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;15
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;16
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;17
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;18
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;19
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;20
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;21
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;22
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;23
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;24
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;25
&lt;/span&gt;&lt;span class=&#34;lnt&#34;&gt;26
&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;
&lt;td class=&#34;lntd&#34;&gt;
&lt;pre tabindex=&#34;0&#34; class=&#34;chroma&#34;&gt;&lt;code class=&#34;language-bibtex&#34; data-lang=&#34;bibtex&#34;&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@inproceedings&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;gao2023funasr&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Zhifu Gao and Zerui Li and Jiaming Wang and Haoneng Luo and Xian Shi and Mengzhe Chen and Yabin Li and Lingyun Zuo and Zhihao Du and Zhangyu Xiao and Shiliang Zhang}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{FunASR: A Fundamental End-to-End Speech Recognition Toolkit}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2023}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;booktitle&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{INTERSPEECH}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@inproceedings&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;An2023bat&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Keyu An and Xian Shi and Shiliang Zhang}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{BAT: Boundary aware transducer for memory-efficient and low-latency ASR}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2023}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;booktitle&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{INTERSPEECH}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@inproceedings&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;gao22b_interspeech&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Zhifu Gao and ShiLiang Zhang and Ian McLoughlin and Zhijie Yan}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;m&#34;&gt;2022&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;booktitle&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Proc. Interspeech 2022}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;pages&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2063--2067}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;doi&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{10.21437/Interspeech.2022-9996}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;nc&#34;&gt;@inproceedings&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;{&lt;/span&gt;&lt;span class=&#34;nl&#34;&gt;shi2023seaco&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;author&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{Xian Shi and Yexin Yang and Zerui Li and Yanni Chen and Zhifu Gao and Shiliang Zhang}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;title&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{SeACo-Paraformer: A Non-Autoregressive ASR System with Flexible and Effective Hotword Customization Ability}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;year&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{2023}&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;,&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;  &lt;span class=&#34;na&#34;&gt;booktitle&lt;/span&gt;&lt;span class=&#34;p&#34;&gt;=&lt;/span&gt;&lt;span class=&#34;s&#34;&gt;{ICASSP2024}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;span class=&#34;line&#34;&gt;&lt;span class=&#34;cl&#34;&gt;&lt;span class=&#34;p&#34;&gt;}&lt;/span&gt;
&lt;/span&gt;&lt;/span&gt;&lt;/code&gt;&lt;/pre&gt;&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;
&lt;/div&gt;
&lt;/div&gt;</description>
        </item>
        
    </channel>
</rss>
