index.html

<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="utf-8">
  <!-- Google site-verification token. -->
  <meta name="google-site-verification" content="oZMIcPh6afVajpq9eSwxoKM79HITHoE3mZ46IXmt6D8">
  <meta name="description"
        content="MotionGPT: Finetuned LLMs are General-Purpose Motion Generators.">
  <meta name="keywords" content="MotionGPT, Motion Generation, LLM">
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>MotionGPT: Finetuned LLMs are General-Purpose Motion Generators</title>

  <!-- Global site tag (gtag.js) - Google Analytics -->
  <script async src="https://www.googletagmanager.com/gtag/js?id=G-PYVRSFMDRL"></script>
  <script>
    // Reuse an existing dataLayer queue if one is present, otherwise start an empty one.
    window.dataLayer = window.dataLayer || [];

    // Appends the arguments object of each call to the dataLayer queue.
    function gtag() {
      dataLayer.push(arguments);
    }

    gtag('js', new Date());

    gtag('config', 'G-PYVRSFMDRL');
  </script>

  <!-- Web fonts and stylesheets: Bulma core and plugins, icon fonts, then the page's own rules. -->
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro"
        rel="stylesheet">

  <link rel="stylesheet" href="./static/css/bulma.min.css">
  <link rel="stylesheet" href="./static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="./static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="./static/css/fontawesome.all.min.css">
  <link rel="stylesheet"
        href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="./static/css/index.css">
  <link rel="icon" href="./static/images/dancing-motion-svgrepo-com.svg">

  <!--
    Scripts load in document order: jQuery, the Bulma plugins, then index.js.
    NOTE(review): index.js presumably depends on the libraries loaded before it;
    confirm before adding defer/async to any of them.
  -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script defer src="./static/js/fontawesome.all.min.js"></script>
  <script src="./static/js/bulma-carousel.min.js"></script>
  <script src="./static/js/bulma-slider.min.js"></script>
  <script src="./static/js/index.js"></script>
</head>
<body>

<!-- Title block: paper title, author list, affiliations and resource links. -->
<section class="hero">
  <div class="hero-body">
    <div class="container is-max-desktop">
      <div class="columns is-centered">
        <div class="column has-text-centered">
          <h1 class="title is-1 publication-title">MotionGPT: Finetuned LLMs are General-Purpose Motion Generators</h1>

          <!-- Authors; the superscripts refer to the numbered affiliations listed below. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block">Yaqi Zhang<sup>1,2</sup>,</span>
            <span class="author-block">Di Huang<sup>4</sup>,</span>
            <span class="author-block">Bin Liu<sup>1,2</sup>,</span>
            <span class="author-block">Shixiang Tang<sup>4</sup>,</span>
            <span class="author-block">Yan Lu<sup>4</sup>,</span>
            <span class="author-block">Lu Chen<sup>5</sup>,</span>
            <span class="author-block">Lei Bai<sup>3</sup>,</span>
            <span class="author-block">Qi Chu<sup>1,2</sup>,</span>
            <span class="author-block">Nenghai Yu<sup>1,2</sup>,</span>
            <span class="author-block">Wanli Ouyang<sup>3</sup></span>
          </div>

          <!-- Affiliations. -->
          <div class="is-size-5 publication-authors">
            <span class="author-block"><sup>1</sup>University of Science and Technology of China</span>
            <span class="author-block"><sup>2</sup>CAS Key Laboratory of Electromagnetic Space Information</span>
            <span class="author-block"><sup>3</sup>Shanghai AI Laboratory</span>
            <span class="author-block"><sup>4</sup>The University of Sydney</span>
            <span class="author-block"><sup>5</sup>Zhejiang University</span>
          </div>

          <!-- Resource links. Each icon is decorative; the adjacent text span names the link. -->
          <div class="column has-text-centered">
            <div class="publication-links">
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://arxiv.org/abs/2306.10900">
                  <span class="icon">
                    <i class="ai ai-arxiv" aria-hidden="true"></i>
                  </span>
                  <span>arXiv</span>
                </a>
              </span>
              <span class="link-block">
                <a class="external-link button is-normal is-rounded is-dark"
                   href="https://github.com/qiqiApink/MotionGPT">
                  <span class="icon">
                    <i class="fab fa-github" aria-hidden="true"></i>
                  </span>
                  <span>Code</span>
                </a>
              </span>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Teaser: a muted, looping video that starts automatically, with a one-line caption. -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <!-- No poster attribute: when present it must hold a non-empty URL. -->
      <video id="tree" playsinline autoplay muted loop height="80%">
        <source src="./static/videos/teaser.mp4" type="video/mp4">
      </video>
      <h2 class="subtitle has-text-centered">
        MotionGPT supports diverse control conditions for human motion generation by finetuning LLMs.
      </h2>
    </div>
  </div>
</section>

<section class="section">
  <div class="container is-max-desktop">
    <!-- Abstract: paper abstract followed by the comparison figure and its caption. -->
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Abstract</h2>
        <div class="content has-text-justified">
          <p>
            Generating realistic human motion from given action descriptions has experienced significant
            advancements because of the emerging requirement of digital humans. While recent works have
            achieved impressive results in generating motion directly from textual action descriptions,
            they often support only a single modality of the control signal, which limits their application
            in the real digital human industry. This paper presents a <b>Motion G</b>eneral-<b>P</b>urpose genera<b>T</b>or
            (MotionGPT) that can use multimodal control signals, <i>e.g.</i>, text and single-frame poses, for
            generating consecutive human motions by treating multimodal signals as special input tokens in
            large language models (LLMs). Specifically, we first quantize multimodal control signals into
            discrete codes and then formulate them in a unified prompt instruction to ask the LLMs to generate
            the motion answer. Our MotionGPT demonstrates a unified human motion generation model with
            multimodal control signals by tuning a mere 0.4% of LLM parameters. To the best of our knowledge,
            MotionGPT is the first method to generate human motion by multimodal control signals, which we
            hope can shed light on this new direction.
          </p>
          <img src="./static/images/teaser.png"
               alt="Comparison of MotionGPT with previous motion generation methods.">
          <div class="content has-text-justified">
            <p>Compared with previous methods, MotionGPT has the unique ability to accept multiple control conditions and solve various motion generation tasks using a unified model.</p>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>


<!-- Pipeline: overview figure followed by a short description of the method. -->
<section class="section">
  <div class="container is-max-desktop">
    <div class="columns is-centered has-text-centered">
      <div class="column is-four-fifths">
        <h2 class="title is-3">Pipeline</h2>

        <img src="./static/images/pipeline.png" alt="Overview of the MotionGPT pipeline.">
        <div class="content has-text-justified">
          <p>
            Our MotionGPT (<b>Motion G</b>eneral-<b>P</b>urpose genera<b>T</b>or) has
            the unique ability to accept multiple control conditions and solve various
            motion generation tasks using a unified model. Given text and poses as an
            input example, we organize task descriptions (Instruction) and multiple
            control conditions (Input) within a question template. MotionGPT fine-tunes
            an LLM with LoRA to generate the corresponding motion answer, which can then
            be decoded into human motions using a VQ-VAE decoder.
          </p>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Text-to-motion results: one carousel slide per prompt/video pair. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Text-to-motion Generation</h2>
          <!-- Colour legend for the videos; spans replace the obsolete font element. -->
          <p><b><span style="font-family: verdana;">the generated motion is in <span style="color: #f5a623;">orange</span></span></b></p>
          <div class="content has-text-justified">
            <div id="results-carousel" class="carousel results-carousel">
              <!--
                Every video has its own id because ids must be unique in a document.
                NOTE(review): confirm nothing in static/css or static/js selects the
                videos through the previously shared id "tree".
              -->
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person walks forward, turns and then sits on a chair</i></p></div>
                <video id="t2m-video-0" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/t2m_0.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a hunched individual slowly wobbles forward in a drunken manner</i></p></div>
                <video id="t2m-video-1" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/t2m_1.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person walks forward at an angle to the right, then swings their left hand</i></p></div>
                <video id="t2m-video-2" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/t2m_2.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person stirs something with his left hand</i></p></div>
                <video id="t2m-video-3" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/t2m_3.mp4" type="video/mp4">
                </video>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- (Text, initial pose)-to-motion results: one carousel slide per prompt/video pair. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">(Text, initial pose)-to-motion Generation</h2>
          <!-- Colour legend for the videos; spans replace the obsolete font element. -->
          <p><b><span style="font-family: verdana;">the generated motion is in <span style="color: #f5a623;">orange</span> and we highlight the initial pose in <span style="color: #4a90e2;">blue (remains frozen for 0.5s)</span></span></b></p>
          <div class="content has-text-justified">
            <!--
              The carousel and every video carry ids unique to this section.
              NOTE(review): confirm static/js/index.js and static/css/index.css select the
              carousel by class (.carousel / .results-carousel) and do not rely on the
              previously shared ids "results-carousel" or "tree".
            -->
            <div id="results-carousel-initial-pose" class="carousel results-carousel">
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person slowly walked forward and returned</i></p></div>
                <video id="initial-pose-video-0" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/initial_0.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>person is running from side to side</i></p></div>
                <video id="initial-pose-video-1" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/initial_1.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person slowly walked forward while balancing</i></p></div>
                <video id="initial-pose-video-2" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/initial_2.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person walks forward very slowly</i></p></div>
                <video id="initial-pose-video-3" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/initial_3.mp4" type="video/mp4">
                </video>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- (Text, last pose)-to-motion results: one carousel slide per prompt/video pair. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">(Text, last pose)-to-motion Generation</h2>
          <!-- Colour legend for the videos; spans replace the obsolete font element. -->
          <p><b><span style="font-family: verdana;">the generated motion is in <span style="color: #f5a623;">orange</span> and we highlight the last pose in <span style="color: #4a90e2;">blue (remains frozen for 0.5s)</span></span></b></p>
          <div class="content has-text-justified">
            <!--
              The carousel and every video carry ids unique to this section.
              NOTE(review): confirm static/js/index.js and static/css/index.css select the
              carousel by class (.carousel / .results-carousel) and do not rely on the
              previously shared ids "results-carousel" or "tree".
            -->
            <div id="results-carousel-last-pose" class="carousel results-carousel">
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person with his arms bent kicks to side with his left foot</i></p></div>
                <video id="last-pose-video-0" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/last_0.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person turns right while walking then stops</i></p></div>
                <video id="last-pose-video-1" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/last_1.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>walking backwards and then stopping</i></p></div>
                <video id="last-pose-video-2" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/last_2.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>swinging hands up and down</i></p></div>
                <video id="last-pose-video-3" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/last_3.mp4" type="video/mp4">
                </video>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- (Text, key poses)-to-motion results: one carousel slide per prompt/video pair. -->
<section class="hero is-small">
  <div class="hero-body">
    <div class="container">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">(Text, key poses)-to-motion Generation</h2>
          <!-- Colour legend for the videos; spans replace the obsolete font element. -->
          <p><b><span style="font-family: verdana;">the generated motion is in <span style="color: #f5a623;">orange</span> and we highlight key poses in <span style="color: #4a90e2;">blue (remain frozen for 0.5s)</span></span></b></p>
          <div class="content has-text-justified">
            <!--
              The carousel and every video carry ids unique to this section.
              NOTE(review): confirm static/js/index.js and static/css/index.css select the
              carousel by class (.carousel / .results-carousel) and do not rely on the
              previously shared ids "results-carousel" or "tree".
            -->
            <div id="results-carousel-key-poses" class="carousel results-carousel">
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a walking person suddenly gets staggered to their left, then recovers</i></p></div>
                <video id="key-poses-video-0" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/keys_0.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>standing on one leg and swinging it</i></p></div>
                <video id="key-poses-video-1" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/keys_1.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>the man dances around waving his arms and kicking his legs</i></p></div>
                <video id="key-poses-video-2" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/keys_2.mp4" type="video/mp4">
                </video>
              </div>
              <div class="column is-centered has-text-centered">
                <div style="width: 500px; height: 100px;"><p><i>a person does multiple jumping jacks</i></p></div>
                <video id="key-poses-video-3" playsinline autoplay muted loop height="100%">
                  <source src="./static/videos/keys_3.mp4" type="video/mp4">
                </video>
              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </div>
</section>

<!-- Citation block. -->
<section class="section" id="BibTeX">
  <div class="container is-max-desktop content">
    <h2 class="title">BibTeX</h2>
    <!--
      Whitespace inside pre is rendered verbatim, so the entry starts directly after
      the opening code tag and is not indented with the surrounding markup; this keeps
      a copied citation free of a leading blank line and stray indentation.
    -->
    <pre><code>@article{zhang2023motiongpt,
  title={MotionGPT: Finetuned LLMs are General-Purpose Motion Generators},
  author={Zhang, Yaqi and Huang, Di and Liu, Bin and Tang, Shixiang and Lu, Yan and Chen, Lu and Bai, Lei and Chu, Qi and Yu, Nenghai and Ouyang, Wanli},
  journal={arXiv preprint arXiv:2306.10900},
  year={2023}
}</code></pre>
  </div>
</section>


<!-- Page footer: icon links to the paper PDF and the code repository, then license and template credit. -->
<footer class="footer">
  <div class="container">
    <div class="content has-text-centered">
      <!-- Icon-only links: aria-label supplies the accessible name, the glyph itself is decorative. -->
      <a class="icon-link"
         href="./static/images/motiongpt_paper.pdf"
         aria-label="MotionGPT paper (PDF)">
        <i class="fas fa-file-pdf" aria-hidden="true"></i>
      </a>
      <!--
        A single class attribute is kept: a repeated attribute on the same tag is
        ignored by the parser, so only "icon-link" ever applied. The "disabled"
        attribute is not defined for anchors and is omitted.
      -->
      <a class="icon-link"
         href="https://github.com/qiqiApink/MotionGPT"
         aria-label="MotionGPT code on GitHub">
        <i class="fab fa-github" aria-hidden="true"></i>
      </a>
    </div>
    <div class="columns is-centered">
      <div class="column is-8">
        <div class="content">
          <p>
            This website is licensed under a <a rel="license"
                                                href="https://creativecommons.org/licenses/by-sa/4.0/">Creative
            Commons Attribution-ShareAlike 4.0 International License</a>.
          </p>
          <p>
            Website source code based on the <a href="https://nerfies.github.io">Nerfies</a> project page.
            If you want to reuse their <a href="https://github.com/nerfies/nerfies.github.io">source code</a>,
            please credit them appropriately.
          </p>
        </div>
      </div>
    </div>
  </div>
</footer>

</body>
</html>