diff --git a/.dev_scripts/build_base_image.sh b/.dev_scripts/build_base_image.sh deleted file mode 100644 index 8c8c9a0e6..000000000 --- a/.dev_scripts/build_base_image.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash -# default values. -BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04 -BASE_GPU_CUDA113_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel -BASE_GPU_CUDA117_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.7.1-cudnn8-devel -BASE_GPU_CUDA118_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.8.0-cudnn8-devel -MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope -python_version=3.7.13 -torch_version=1.11.0 -cuda_version=11.7.1 -cudatoolkit_version=11.3 -tensorflow_version=1.15.5 -version=None -is_cpu=False -function usage(){ - echo "usage: build.sh " - echo " --python=python_version set python version, default: $python_version" - echo " --cuda=cuda_version set cuda version,only[11.3.0, 11.7.1], fefault: $cuda_version" - echo " --torch=torch_version set pytorch version, fefault: $torch_version" - echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" - echo " --test option for run test before push image, only push on ci test pass" - echo " --cpu option for build cpu version" - echo " --push option for push image to remote repo" -} -for i in "$@"; do - case $i in - --python=*) - python_version="${i#*=}" - shift - ;; - --cuda=*) - cuda_version="${i#*=}" - shift # pytorch version - ;; - --torch=*) - torch_version="${i#*=}" - shift # pytorch version - ;; - --tensorflow=*) - tensorflow_version="${i#*=}" - shift # tensorflow version - ;; - --version=*) - version="${i#*=}" - shift # version - ;; - --cpu) - is_cpu=True - shift # is cpu image - ;; - --push) - is_push=True - shift # option for push image to remote repo - ;; - --help) - usage - exit 0 - ;; - -*|--*) - echo "Unknown option $i" - usage - exit 1 - ;; - *) - ;; - esac -done - -if [ "$cuda_version" == 11.3.0 ]; then - echo "Building base image cuda11.3.0" - BASE_GPU_IMAGE=$BASE_GPU_CUDA113_IMAGE - cudatoolkit_version=cu113 -elif [ "$cuda_version" == 11.7.1 ]; then - echo "Building base image cuda11.7.1" - cudatoolkit_version=cu117 - BASE_GPU_IMAGE=$BASE_GPU_CUDA117_IMAGE -elif [ "$cuda_version" == 11.8.0 ]; then - echo "Building base image cuda11.8.0" - cudatoolkit_version=cu118 - BASE_GPU_IMAGE=$BASE_GPU_CUDA118_IMAGE -else - echo "Unsupport cuda version: $cuda_version" - exit 1 -fi - -if [ "$is_cpu" == "True" ]; then - export BASE_IMAGE=$BASE_CPU_IMAGE - base_tag=ubuntu20.04 - export USE_GPU=False -else - export BASE_IMAGE=$BASE_GPU_IMAGE - base_tag=ubuntu20.04-cuda$cuda_version - export USE_GPU=True -fi -if [[ $python_version == 3.7* ]]; then - base_tag=$base_tag-py37 -elif [[ $python_version == 3.8* ]]; then - base_tag=$base_tag-py38 -else - echo "Unsupport python version: $python_version" - exit 1 -fi - -target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version-base -export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag -export PYTHON_VERSION=$python_version -export TORCH_VERSION=$torch_version -export CUDATOOLKIT_VERSION=$cudatoolkit_version -export TENSORFLOW_VERSION=$tensorflow_version -echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\n" -docker_file_content=`cat docker/Dockerfile.ubuntu_base` -printf "$docker_file_content" > Dockerfile - -while true -do - docker 
build -t $IMAGE_TO_BUILD \ - --build-arg USE_GPU \ - --build-arg BASE_IMAGE \ - --build-arg PYTHON_VERSION \ - --build-arg TORCH_VERSION \ - --build-arg CUDATOOLKIT_VERSION \ - --build-arg TENSORFLOW_VERSION \ - -f Dockerfile . - if [ $? -eq 0 ]; then - echo "Image build done" - break - else - echo "Running docker build command error, we will retry" - fi -done - -if [ "$is_push" == "True" ]; then - echo "Pushing image: $IMAGE_TO_BUILD" - docker push $IMAGE_TO_BUILD -fi diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh deleted file mode 100644 index dceaaa22d..000000000 --- a/.dev_scripts/build_image.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash -# default values. -#BASE_PY37_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py37-torch1.11.0-tf1.15.5-base -#BASE_PY38_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py38-torch1.11.0-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py38-torch1.11.0-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.7.1-py38-torch1.13.1-tf2.6.0-base -#BASE_PY37_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-base -MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope -python_version=3.7.13 -torch_version=1.11.0 -cudatoolkit_version=11.7 -tensorflow_version=1.15.5 -modelscope_version=None -cuda_version=11.7.1 -is_ci_test=False -is_dsw=False -is_cpu=False -run_ci_test=False -function usage(){ - echo "usage: build.sh " - echo " --python=python_version set python version, default: $python_version" - echo " --cuda=cuda_version set cuda version,only[11.3.0, 11.7.1], fefault: $cuda_version" - echo " --torch=torch_version set pytorch version, fefault: $torch_version" - echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" - echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version" - echo " --test option for run test before push image, only push on ci test pass" - echo " --cpu option for build cpu version" - echo " --dsw option for build dsw version" - echo " --ci option for build ci version" - echo " --push option for push image to remote repo" -} -for i in "$@"; do - case $i in - --python=*) - python_version="${i#*=}" - shift - ;; - --cuda=*) - cuda_version="${i#*=}" - if [ "$cuda_version" == "11.3.0" ]; then - cudatoolkit_version=11.3 - elif [ "$cuda_version" == "11.7.1" ]; then - cudatoolkit_version=11.7 - elif [ "$cuda_version" == "11.8.0" ]; then - cudatoolkit_version=11.8 - else - echo "Unsupport cuda version $cuda_version" - exit 1 - fi - shift # pytorch version - ;; - --torch=*) - torch_version="${i#*=}" - shift # pytorch version - ;; - --tensorflow=*) - tensorflow_version="${i#*=}" - shift # tensorflow version - ;; - --cudatoolkit=*) - cudatoolkit_version="${i#*=}" - shift # cudatoolkit for pytorch - ;; - --modelscope=*) - modelscope_version="${i#*=}" - shift # modelscope version - ;; - --test) - run_ci_test=True - shift # will run ci test - ;; - --cpu) - is_cpu=True - shift # is cpu image - ;; - --ci) - is_ci_test=True - shift # is ci, will not install modelscope - ;; - --dsw) - is_dsw=True - shift # is dsw, will set dsw cache location - ;; - --push) - is_push=True - shift # option for push image to remote repo 
- ;; - --help) - usage - exit 0 - ;; - -*|--*) - echo "Unknown option $i" - usage - exit 1 - ;; - *) - ;; - esac -done - -if [ "$modelscope_version" == "None" ]; then - echo "ModelScope version must specify!" - exit 1 -fi -if [ "$is_cpu" == "True" ]; then - base_tag=ubuntu20.04 - export USE_GPU=False -else - base_tag=ubuntu20.04-cuda$cuda_version - export USE_GPU=True -fi - -if [[ $python_version == 3.7* ]]; then - if [ "$is_cpu" == "True" ]; then - echo "Building python3.7 cpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py37-torch$torch_version-tf$tensorflow_version-base - else - echo "Building python3.7 gpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda$cuda_version-py37-torch$torch_version-tf$tensorflow_version-base - fi - base_tag=$base_tag-py37 -elif [[ $python_version == 3.8* ]]; then - if [ "$is_cpu" == "True" ]; then - echo "Building python3.8 cpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py38-torch$torch_version-tf$tensorflow_version-base - else - echo "Building python3.8 gpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda$cuda_version-py38-torch$torch_version-tf$tensorflow_version-base - fi - base_tag=$base_tag-py38 -else - echo "Unsupport python version: $python_version" - exit 1 -fi - -target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version -if [ "$is_ci_test" == "True" ]; then - target_image_tag=$target_image_tag-$modelscope_version-ci -else - target_image_tag=$target_image_tag-$modelscope_version-test -fi -export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag -export PYTHON_VERSION=$python_version -export TORCH_VERSION=$torch_version -export CUDATOOLKIT_VERSION=$cudatoolkit_version -export TENSORFLOW_VERSION=$tensorflow_version -echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n" -docker_file_content=`cat docker/Dockerfile.ubuntu` -if [ "$is_ci_test" != "True" ]; then - echo "Building ModelScope lib, will install ModelScope lib to image" - docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir -U funasr transformers && pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/build/modelscope-$modelscope_version-py3-none-any.whl " -fi -echo "$is_dsw" -if [ "$is_dsw" == "False" ]; then - echo "Not DSW image" -else - echo "Building dsw image will need set ModelScope lib cache location." - docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope" - # pre compile extension - docker_file_content="${docker_file_content} \nRUN python -c 'from modelscope.utils.pre_compile import pre_compile_all;pre_compile_all()'" - if [ "$is_cpu" == "True" ]; then - echo 'build cpu image' - else - # fix easycv extension and tinycudann conflict. 
- docker_file_content="${docker_file_content} \nRUN bash /tmp/install_tiny_cuda_nn.sh" - fi -fi -if [ "$is_ci_test" == "True" ]; then - echo "Building CI image, uninstall modelscope" - docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y" -fi -printf "$docker_file_content" > Dockerfile - -while true -do - docker build -t $IMAGE_TO_BUILD \ - --build-arg USE_GPU \ - --build-arg BASE_IMAGE \ - --build-arg PYTHON_VERSION \ - --build-arg TORCH_VERSION \ - --build-arg CUDATOOLKIT_VERSION \ - --build-arg TENSORFLOW_VERSION \ - -f Dockerfile . - if [ $? -eq 0 ]; then - echo "Image build done" - break - else - echo "Running docker build command error, we will retry" - fi -done - -if [ "$run_ci_test" == "True" ]; then - echo "Running ci case." - export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache - export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential - export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS - export IMAGE_VERSION=$target_image_tag - export MODELSCOPE_DOMAIN=www.modelscope.cn - export HUB_DATASET_ENDPOINT=http://www.modelscope.cn - export CI_TEST=True - export TEST_LEVEL=1 - if [ "$is_ci_test" != "True" ]; then - echo "Testing for dsw image or MaaS-lib image" - export CI_COMMAND="python tests/run.py" - fi - bash .dev_scripts/dockerci.sh - if [ $? -ne 0 ]; then - echo "Running unittest failed, please check the log!" - exit -1 - fi -fi -if [ "$is_push" == "True" ]; then - echo "Pushing image: $IMAGE_TO_BUILD" - docker push $IMAGE_TO_BUILD -fi diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index 0278a7857..4f66073cd 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -14,6 +14,7 @@ echo "PR modified files: $PR_CHANGED_FILES" PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#} echo "PR_CHANGED_FILES: $PR_CHANGED_FILES" idx=0 +sleep 65 for gpu in $gpus do exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 diff --git a/.dev_scripts/run_docker.sh b/.dev_scripts/run_docker.sh deleted file mode 100644 index 8999458ac..000000000 --- a/.dev_scripts/run_docker.sh +++ /dev/null @@ -1,7 +0,0 @@ -#sudo docker run --name zwm_maas -v /home/wenmeng.zwm/workspace:/home/wenmeng.zwm/workspace --net host -ti reg.docker.alibaba-inc.com/pai-dlc/tensorflow-training:2.3-gpu-py36-cu101-ubuntu18.04 bash -#sudo docker run --name zwm_maas_pytorch -v /home/wenmeng.zwm/workspace:/home/wenmeng.zwm/workspace --net host -ti reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 bash -CONTAINER_NAME=modelscope-dev -IMAGE_NAME=registry.cn-shanghai.aliyuncs.com/modelscope/modelscope -IMAGE_VERSION=v0.1.1-16-g62856fa-devel -MOUNT_DIR=/home/wenmeng.zwm/workspace -sudo docker run --name $CONTAINER_NAME -v $MOUNT_DIR:$MOUNT_DIR --net host -ti ${IMAGE_NAME}:${IMAGE_VERSION} bash diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4fdf7351f..f5a42ca45 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -3,7 +3,7 @@ name: Bug report about: Create a bug report to help us improve title: '' labels: '' -assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -36,14 +36,14 @@ A clear and concise description of what the bug is. 
Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet -Model hub related: @liuyhwangyh +Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778 Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 0731f3c1f..6eef2aa58 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -3,7 +3,7 @@ name: Feature request about: Suggest an idea for this project title: '' labels: '' -assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn +assignees: yingdachen, wangxingjun778, tastelikefeet --- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index c7ec72562..3545e5435 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -3,7 +3,7 @@ name: Question about: Describe this issue template's purpose here. title: '' labels: '' -assignees: zzclynn,wenmengzhou +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -18,7 +18,7 @@ Before asking a question, make sure you have: Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet Model hub related: @liuyhwangyh @@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 000000000..13f61ff31 --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,54 @@ +name: Build Docker Image + +on: + workflow_dispatch: + inputs: + workflow_name: + description: 'The specific name of this build' + required: true + default: 'build' + modelscope_branch: + description: 'ModelScope branch to build from(release/x.xx)' + required: true + image_type: + description: 'The image type to build(cpu/gpu/llm)' + required: true + modelscope_version: + description: 'ModelScope version to use(x.xx.x)' + required: true + swift_branch: + description: 'SWIFT branch to use(release/x.xx)' + required: true + ci_image: + description: 'Set as the CI image' + default: '0' + required: false + other_params: + description: 'Other params in --xxx xxx' + required: false + +run-name: Docker-${{ inputs.modelscope_branch }}-${{ inputs.image_type }}-${{ inputs.workflow_name }}-by-@${{ github.actor }} + +jobs: + build: + runs-on: [modelscope-self-hosted-us] + + steps: + - name: ResetFileMode + shell: bash + run: | + # reset filemode to allow action runner to delete files + # generated by root in docker + set -e + source ~/.bashrc + sudo chown -R $USER:$USER $ACTION_RUNNER_DIR + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.modelscope_branch }} + + - name: Build Docker Image + run: | + set -e + source ~/.bashrc + python docker/build_image.py --image_type ${{ github.event.inputs.image_type }} --modelscope_branch ${{ github.event.inputs.modelscope_branch }} --modelscope_version ${{ github.event.inputs.modelscope_version }} --swift_branch ${{ 
github.event.inputs.swift_branch }} --ci_image ${{ github.event.inputs.ci_image }} ${{ github.event.inputs.other_params }} diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index dc4b5487b..6ff84517d 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: '3.10' - name: Install pre-commit hook run: | pip install pre-commit diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7c2e180a7..dacf6df78 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -15,10 +15,10 @@ jobs: #if: startsWith(github.event.ref, 'refs/tags') steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.10' - name: Install wheel run: pip install wheel && pip install -r requirements/framework.txt - name: Build ModelScope diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e6e9b774..a8565f16b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: https://github.com/pycqa/flake8.git rev: 4.0.0 diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml index a68a5b785..869d8fd66 100644 --- a/.pre-commit-config_local.yaml +++ b/.pre-commit-config_local.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: /home/admin/pre-commit/flake8 rev: 4.0.0 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index b23f3150a..7ec11ef0b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -61,7 +61,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -feedback@huggingface.co. +contact@modelscope.cn. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the diff --git a/LICENSE b/LICENSE index 14cec7de8..d64569567 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ -Copyright 2022-2023 Alibaba ModelScope. All rights reserved. Apache License Version 2.0, January 2004 @@ -188,7 +187,7 @@ Copyright 2022-2023 Alibaba ModelScope. All rights reserved. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2020-2022 Alibaba ModelScope. + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index dd6d3350e..eb65c053e 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,11 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

@@ -51,35 +56,36 @@ Hundreds of models are made publicly available on [ModelScope]( https://www.mode Some representative examples include: -NLP: +LLM: -* [ChatGLM3-6B](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) * [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [Internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [Udever Multilingual Universal Text Representation Model 1b1](https://modelscope.cn/models/damo/udever-bloom-1b1/summary) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) -* [CoROM Text Vector - Chinese - E-commerce Domain - Base](https://modelscope.cn/models/damo/nlp_corom_sentence-embedding_chinese-base-ecom/summary) - -* [MGeo Address Similarity Matching Entity Alignment - Chinese - Address Field - Base](https://modelscope.cn/models/damo/mgeo_geographic_entity_alignment_chinese_base/summary) Multi-Modal: * [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -* [CogVLM](https://modelscope.cn/models/ZhipuAI/CogVLM/summary) +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) -* [Text-to-Video Synthesis Large Model - English - General Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [I2VGen-XL High Definition Image to Video Large Model](https://modelscope.cn/models/damo/Image-to-Video/summary) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [I2VGen-XL High Definition Video to Video Large Model](https://modelscope.cn/models/damo/Video-to-Video/summary) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) + +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) CV: @@ -293,3 +299,13 @@ We provide additional documentations including: # License This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). + +# Citation +``` +@Misc{modelscope, + title = {ModelScope: bring the notion of Model-as-a-Service to life.}, + author = {The ModelScope Team}, + howpublished = {\url{https://github.com/modelscope/modelscope}}, + year = {2023} +} +``` diff --git a/README_ja.md b/README_ja.md index 4523add49..c3c8c50b7 100644 --- a/README_ja.md +++ b/README_ja.md @@ -17,6 +17,11 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

@@ -51,33 +56,36 @@ ModelScope ライブラリは、様々なモデルの実装を保持するだけ 代表的な例をいくつか挙げると: -NLP: - -* [nlp_gpt3_text-generation_2.7B](https://modelscope.cn/models/damo/nlp_gpt3_text-generation_2.7B) +大きなモデル: -* [ChatYuan-large](https://modelscope.cn/models/ClueAI/ChatYuan-large) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [mengzi-t5-base](https://modelscope.cn/models/langboat/mengzi-t5-base) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [nlp_csanmt_translation_en2zh](https://modelscope.cn/models/damo/nlp_csanmt_translation_en2zh) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) -* [nlp_raner_named-entity-recognition_chinese-base-news](https://modelscope.cn/models/damo/nlp_raner_named-entity-recognition_chinese-base-news) +* [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [nlp_structbert_word-segmentation_chinese-base](https://modelscope.cn/models/damo/nlp_structbert_word-segmentation_chinese-base) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [Erlangshen-RoBERTa-330M-Sentiment](https://modelscope.cn/models/fengshenbang/Erlangshen-RoBERTa-330M-Sentiment) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) -* [nlp_convai_text2sql_pretrain_cn](https://modelscope.cn/models/damo/nlp_convai_text2sql_pretrain_cn) マルチモーダル: -* [multi-modal_clip-vit-base-patch16_zh](https://modelscope.cn/models/damo/multi-modal_clip-vit-base-patch16_zh) +* [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) + +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) + +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [ofa_pretrain_base_zh](https://modelscope.cn/models/damo/ofa_pretrain_base_zh) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [Taiyi-Stable-Diffusion-1B-Chinese-v0.1](https://modelscope.cn/models/fengshenbang/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) -* [mplug_visual-question-answering_coco_large_en](https://modelscope.cn/models/damo/mplug_visual-question-answering_coco_large_en) +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) CV: diff --git a/README_zh.md b/README_zh.md index 10b2e7288..9c9e4248f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -17,30 +17,33 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

English | - 中文 | - 日本語 + 中文 | + 日本語

- # 简介 -[ModelScope]( https://www.modelscope.cn) 是一个“模型即服务”(MaaS)平台,旨在汇集来自AI社区的最先进的机器学习模型,并简化在实际应用中使用AI模型的流程。ModelScope库使开发人员能够通过丰富的API设计执行推理、训练和评估,从而促进跨不同AI领域的最先进模型的统一体验。 - -ModelScope Library为模型贡献者提供了必要的分层API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到ModelScope生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。 - -除了包含各种模型的实现之外,ModelScope Library还支持与ModelScope后端服务进行必要的交互,特别是与Model-Hub和Dataset-Hub的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。 +[ModelScope](https://www.modelscope.cn) 是一个 “模型即服务”(MaaS) 平台,旨在汇集来自 AI 社区的最先进的机器学习模型,并简化在实际应用中使用 AI 模型的流程。ModelScope 库使开发人员能够通过丰富的 API 设计执行推理、训练和评估,从而促进跨不同 AI 领域的最先进模型的统一体验。 +ModelScope Library 为模型贡献者提供了必要的分层 API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到 ModelScope 生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。 +除了包含各种模型的实现之外,ModelScope Library 还支持与 ModelScope 后端服务进行必要的交互,特别是与 Model-Hub 和 Dataset-Hub 的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。 # 部分模型和在线体验 -ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计算机视觉、语音、多模态、科学计算等,其中包含数百个SOTA模型。用户可以进入ModelScope网站([modelscope.cn](http://www.modelscope.cn))的模型中心零门槛在线体验,或者Notebook方式体验模型。 + +ModelScope 开源了数百个 (当前 700+) 模型,涵盖自然语言处理、计算机视觉、语音、多模态、科学计算等,其中包含数百个 SOTA 模型。用户可以进入 ModelScope 网站 ([modelscope.cn](http://www.modelscope.cn)) 的模型中心零门槛在线体验,或者 Notebook 方式体验模型。
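A minimal sketch of the Model-Hub / Dataset-Hub interaction described in the introduction above, assuming the `snapshot_download` helper from `modelscope.hub.snapshot_download` (exact signatures may vary between ModelScope releases); the model and dataset IDs are simply reused from the examples later in this README.

```python
# Sketch only: Model-Hub / Dataset-Hub access, with IDs reused from the README examples.
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.msdatasets import MsDataset

# Download (and locally cache) a model snapshot from the Model-Hub;
# repeated calls reuse the cache managed by the library.
model_dir = snapshot_download('damo/nlp_structbert_word-segmentation_chinese-base')
print('model cached at:', model_dir)

# Load a dataset split from the Dataset-Hub and rename a column,
# mirroring the fine-tuning example further down.
eval_dataset = MsDataset.load('chinese-poetry-collection', split='test').remap_columns({'text1': 'src_txt'})
print(eval_dataset)
```

The cache location is typically `~/.cache/modelscope` and can be redirected with the `MODELSCOPE_CACHE` environment variable, as the Docker images in this diff do.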


@@ -50,70 +53,65 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 示例如下: -自然语言处理: +大模型: -* [ChatGLM3-6B](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) * [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [Internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) - -* [Udever-bloom-1b1](https://modelscope.cn/models/damo/udever-bloom-1b1/summary) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [CoROM文本向量-中文-电商领域-base](https://modelscope.cn/models/damo/nlp_corom_sentence-embedding_chinese-base-ecom/summary) - -* [MGeo地址相似度匹配实体对齐-中文-地址领域-base](https://modelscope.cn/models/damo/mgeo_geographic_entity_alignment_chinese_base/summary) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) 多模态: * [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -* [CogVLM](https://modelscope.cn/models/ZhipuAI/CogVLM/summary) +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) -* [Text-to-Video Synthesis Large Model - English - General Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [I2VGen-XL高清图片到视频大模型](https://modelscope.cn/models/damo/Image-to-Video/summary) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [I2VGen-XL高清视频到视频大模型](https://modelscope.cn/models/damo/Video-to-Video/summary) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) 计算机视觉: -* [DamoFD人脸检测关键点模型-0.5G](https://modelscope.cn/models/damo/cv_ddsar_face-detection_iclr23-damofd/summary) +* [DamoFD 人脸检测关键点模型-0.5G](https://modelscope.cn/models/damo/cv_ddsar_face-detection_iclr23-damofd/summary) -* [BSHM人像抠图](https://modelscope.cn/models/damo/cv_unet_image-matting/summary) +* [BSHM 人像抠图](https://modelscope.cn/models/damo/cv_unet_image-matting/summary) -* [DCT-Net人像卡通化-3D](https://modelscope.cn/models/damo/cv_unet_person-image-cartoon-3d_compound-models/summary) +* [DCT-Net 人像卡通化-3D](https://modelscope.cn/models/damo/cv_unet_person-image-cartoon-3d_compound-models/summary) -* [DCT-Net人像卡通化模型-3D](https://modelscope.cn/models/damo/face_chain_control_model/summary) +* [DCT-Net 人像卡通化模型-3D](https://modelscope.cn/models/damo/face_chain_control_model/summary) * [读光-文字识别-行识别模型-中英-通用领域](https://modelscope.cn/models/damo/cv_convnextTiny_ocr-recognition-general_damo/summary) * [读光-文字识别-行识别模型-中英-通用领域](https://modelscope.cn/models/damo/cv_resnet18_ocr-detection-line-level_damo/summary) -* [LaMa图像填充](https://modelscope.cn/models/damo/cv_fft_inpainting_lama/summary) - - - +* [LaMa 图像填充](https://modelscope.cn/models/damo/cv_fft_inpainting_lama/summary) 语音: -* 
[Paraformer语音识别-中文-通用-16k-离线-大型-长音频版本](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) +* [Paraformer 语音识别-中文-通用-16k-离线-大型-长音频版本](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) -* [FSMN声音端点检测-中文-通用-16k-onnx](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-onnx/summary) +* [FSMN 声音端点检测-中文-通用-16k-onnx](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-onnx/summary) -* [Monotonic-Aligner语音时间戳预测-16k-离线](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) +* [Monotonic-Aligner 语音时间戳预测-16k-离线](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) -* [CT-Transformer标点-中文-通用-onnx](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx/summary) +* [CT-Transformer 标点-中文-通用-onnx](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx/summary) * [语音合成-中文-多情绪领域-16k-多发言人](https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_zh-cn_16k/summary) -* [CAM++说话人验证-中文-通用-200k发言人](https://modelscope.cn/models/damo/speech_campplus_sv_zh-cn_16k-common/summary) - +* [CAM++ 说话人验证-中文-通用-200k-发言人](https://modelscope.cn/models/damo/speech_campplus_sv_zh-cn_16k-common/summary) 科学计算: @@ -123,14 +121,15 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 # 快速上手 -我们针对不同任务提供了统一的使用接口, 使用`pipeline`进行模型推理、使用`Trainer`进行微调和评估。 +我们针对不同任务提供了统一的使用接口, 使用 `pipeline` 进行模型推理、使用 `Trainer` 进行微调和评估。 + +对于任意类型输入(图像、文本、音频、视频...)的任何任务,只需 3 行代码即可加载模型并获得推理结果,如下所示: -对于任意类型输入(图像、文本、音频、视频...)的任何任务,只需3行代码即可加载模型并获得推理结果,如下所示: ```python >>> from modelscope.pipelines import pipeline ->>> word_segmentation = pipeline('word-segmentation',model='damo/nlp_structbert_word-segmentation_chinese-base') ->>> word_segmentation('今天天气不错,适合出去游玩') -{'output': '今天 天气 不错 , 适合 出去 游玩'} +>>> word_segmentation = pipeline ('word-segmentation',model='damo/nlp_structbert_word-segmentation_chinese-base') +>>> word_segmentation (' 今天天气不错,适合出去游玩 ') +{'output': ' 今天 天气 不错 , 适合 出去 游玩 '} ``` 给定一张图片,你可以使用如下代码进行人像抠图. @@ -141,42 +140,44 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 >>> import cv2 >>> from modelscope.pipelines import pipeline ->>> portrait_matting = pipeline('portrait-matting') ->>> result = portrait_matting('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matting.png') ->>> cv2.imwrite('result.png', result['output_img']) +>>> portrait_matting = pipeline ('portrait-matting') +>>> result = portrait_matting ('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matting.png') +>>> cv2.imwrite ('result.png', result ['output_img']) ``` + 输出图像如下 ![image](data/resource/portrait_output.png) -对于微调和评估模型, 你需要通过十多行代码构建dataset和trainer,调用`trainer.train()`和`trainer.evaluate()`即可。 +对于微调和评估模型, 你需要通过十多行代码构建 dataset 和 trainer,调用 `trainer.train ()` 和 `trainer.evaluate ()` 即可。 + +例如我们利用 gpt3 1.3B 的模型,加载是诗歌数据集进行 finetune,可以完成古诗生成模型的训练。 -例如我们利用gpt3 1.3B的模型,加载是诗歌数据集进行finetune,可以完成古诗生成模型的训练。 ```python >>> from modelscope.metainfo import Trainers >>> from modelscope.msdatasets import MsDataset >>> from modelscope.trainers import build_trainer ->>> train_dataset = MsDataset.load('chinese-poetry-collection', split='train'). remap_columns({'text1': 'src_txt'}) ->>> eval_dataset = MsDataset.load('chinese-poetry-collection', split='test').remap_columns({'text1': 'src_txt'}) +>>> train_dataset = MsDataset.load ('chinese-poetry-collection', split='train'). 
remap_columns ({'text1': 'src_txt'}) +>>> eval_dataset = MsDataset.load ('chinese-poetry-collection', split='test').remap_columns ({'text1': 'src_txt'}) >>> max_epochs = 10 >>> tmp_dir = './gpt3_poetry' ->>> kwargs = dict( +>>> kwargs = dict ( model='damo/nlp_gpt3_text-generation_1.3B', train_dataset=train_dataset, eval_dataset=eval_dataset, max_epochs=max_epochs, work_dir=tmp_dir) ->>> trainer = build_trainer(name=Trainers.gpt3_trainer, default_args=kwargs) ->>> trainer.train() +>>> trainer = build_trainer (name=Trainers.gpt3_trainer, default_args=kwargs) +>>> trainer.train () ``` -# 为什么要用ModelScope library +# 为什么要用 ModelScope Library -1. 针对不同任务、不同模型抽象了统一简洁的用户接口,3行代码完成推理,10行代码完成模型训练,方便用户使用ModelScope社区中多个领域的不同模型,开箱即用,便于AI入门和教学。 +1. 针对不同任务、不同模型抽象了统一简洁的用户接口,3 行代码完成推理,10 行代码完成模型训练,方便用户使用 ModelScope 社区中多个领域的不同模型,开箱即用,便于 AI 入门和教学。 -2. 构造以模型为中心的开发应用体验,支持模型训练、推理、导出部署,方便用户基于ModelScope Library构建自己的MLOps. +2. 构造以模型为中心的开发应用体验,支持模型训练、推理、导出部署,方便用户基于 ModelScope Library 构建自己的 MLOps. 3. 针对模型推理、训练流程,进行了模块化的设计,并提供了丰富的功能模块实现,方便用户定制化开发来自定义自己的推理、训练等过程。 @@ -185,11 +186,13 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 # 安装 ## 镜像 -ModelScope Library目前支持tensorflow,pytorch深度学习框架进行模型训练、推理, 在Python 3.7+, Pytorch 1.8+, Tensorflow1.15/Tensorflow2.0+测试可运行。 -为了让大家能直接用上ModelScope平台上的所有模型,无需配置环境,ModelScope提供了官方镜像,方便有需要的开发者获取。地址如下: +ModelScope Library 目前支持 tensorflow,pytorch 深度学习框架进行模型训练、推理, 在 Python 3.7+, Pytorch 1.8+, Tensorflow1.15/Tensorflow2.0 + 测试可运行。 + +为了让大家能直接用上 ModelScope 平台上的所有模型,无需配置环境,ModelScope 提供了官方镜像,方便有需要的开发者获取。地址如下: + +CPU 镜像 -CPU镜像 ```shell # py37 registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py37-torch1.11.0-tf1.15.5-1.6.1 @@ -198,7 +201,8 @@ registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py37-to registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5 ``` -GPU镜像 +GPU 镜像 + ```shell # py37 registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.6.1 @@ -207,81 +211,91 @@ registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11. 
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.8.0-py38-torch2.0.1-tf2.13.0-1.9.5 ``` -## 搭建本地Python环境 +## 搭建本地 Python 环境 + +你也可以使用 pip 和 conda 搭建本地 python 环境,ModelScope 支持 python3.7 + 以上环境,我们推荐使用 [Anaconda](https://docs.anaconda.com/anaconda/install/),安装完成后,执行如下命令为 modelscope library 创建对应的 python 环境: -你也可以使用pip和conda搭建本地python环境,ModelScope支持python3.7+以上环境,我们推荐使用[Anaconda](https://docs.anaconda.com/anaconda/install/),安装完成后,执行如下命令为modelscope library创建对应的python环境: ```shell conda create -n modelscope python=3.8 conda activate modelscope ``` 接下来根据所需使用的模型依赖安装底层计算框架 -* 安装Pytorch [文档链接](https://pytorch.org/get-started/locally/) -* 安装tensorflow [文档链接](https://www.tensorflow.org/install/pip) +* 安装 Pytorch [文档链接](https://pytorch.org/get-started/locally/) +* 安装 tensorflow [文档链接](https://www.tensorflow.org/install/pip) + +安装完前置依赖,你可以按照如下方式安装 ModelScope Library。 -安装完前置依赖,你可以按照如下方式安装ModelScope Library。 +ModelScope Libarary 由核心框架,以及不同领域模型的对接组件组成。如果只需要 ModelScope 模型和数据集访问等基础能力,可以只安装 ModelScope 的核心框架: -ModelScope Libarary由核心框架,以及不同领域模型的对接组件组成。如果只需要ModelScope模型和数据集访问等基础能力,可以只安装ModelScope的核心框架: ```shell pip install modelscope ``` 如仅需体验多模态领域的模型,可执行如下命令安装领域依赖: + ```shell -pip install modelscope[multi-modal] +pip install modelscope [multi-modal] ``` -如仅需体验NLP领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): +如仅需体验 NLP 领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[nlp] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [nlp] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -If you want to use cv models: +如仅需体验计算机视觉领域的模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[cv] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [cv] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -如仅需体验语音领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): +如仅需体验语音领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[audio] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [audio] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -`注意`:当前大部分语音模型需要在Linux环境上使用,并且推荐使用python3.7 + tensorflow 1.x的组合。 +`注意`:当前大部分语音模型需要在 Linux 环境上使用,并且推荐使用 python3.7 + tensorflow 1.x 的组合。 + +如仅需体验科学计算领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): -如仅需体验科学计算领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): ```shell -pip install modelscope[science] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [science] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -`注`: -1. 目前部分语音相关的模型仅支持 python3.7,tensorflow1.15.4的Linux环境使用。 其他绝大部分模型可以在windows、mac(x86)上安装使用。. +`注意`: + +1. 目前部分语音相关的模型仅支持 python3.7,tensorflow1.15.4 的 Linux 环境使用。 其他绝大部分模型可以在 windows、mac(x86)上安装使用。 + +2. 语音领域中一部分模型使用了三方库 SoundFile 进行 wav 文件处理,在 Linux 系统上用户需要手动安装 SoundFile 的底层依赖库 libsndfile,在 Windows 和 MacOS 上会自动安装不需要用户操作。详细信息可参考 [SoundFile 官网](https://github.com/bastibe/python-soundfile#installation)。以 Ubuntu 系统为例,用户需要执行如下命令: -2. 语音领域中一部分模型使用了三方库SoundFile进行wav文件处理,在Linux系统上用户需要手动安装SoundFile的底层依赖库libsndfile,在Windows和MacOS上会自动安装不需要用户操作。详细信息可参考[SoundFile 官网](https://github.com/bastibe/python-soundfile#installation)。以Ubuntu系统为例,用户需要执行如下命令: ```shell sudo apt-get update sudo apt-get install libsndfile1 ``` -3. 
CV领域的少数模型,需要安装mmcv-full, 如果运行过程中提示缺少mmcv,请参考mmcv[安装手册](https://github.com/open-mmlab/mmcv#installation)进行安装。 这里提供一个最简版的mmcv-full安装步骤,但是要达到最优的mmcv-full的安装效果(包括对于cuda版本的兼容),请根据自己的实际机器环境,以mmcv官方安装手册为准。 +3. CV 领域的少数模型,需要安装 mmcv-full, 如果运行过程中提示缺少 mmcv,请参考 mmcv [安装手册](https://github.com/open-mmlab/mmcv#installation) 进行安装。 这里提供一个最简版的 mmcv-full 安装步骤,但是要达到最优的 mmcv-full 的安装效果(包括对于 cuda 版本的兼容),请根据自己的实际机器环境,以 mmcv 官方安装手册为准。 + ```shell pip uninstall mmcv # if you have installed mmcv, uninstall it pip install -U openmim mim install mmcv-full ``` - # 更多教程 除了上述内容,我们还提供如下信息: + * [更加详细的安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) * [任务的介绍](https://modelscope.cn/docs/%E4%BB%BB%E5%8A%A1%E7%9A%84%E4%BB%8B%E7%BB%8D) * [模型推理](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) * [模型微调](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) * [数据预处理](https://modelscope.cn/docs/%E6%95%B0%E6%8D%AE%E7%9A%84%E9%A2%84%E5%A4%84%E7%90%86) * [模型评估](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AF%84%E4%BC%B0) -* [贡献模型到ModelScope](https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88) +* [贡献模型到 ModelScope](https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88) # License -本项目使用[Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). +本项目使用 [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). diff --git a/data/test b/data/test index 77a9ad7fb..dedb3ce44 160000 --- a/data/test +++ b/data/test @@ -1 +1 @@ -Subproject commit 77a9ad7fb3cc4bcc99f4a33822c813e7ab473ba0 +Subproject commit dedb3ce44796328b58a2aa47d3434037a9d63c7f diff --git a/docker/.dockerignore b/docker/.dockerignore index 14284cb62..0fc13a9b7 100644 --- a/docker/.dockerignore +++ b/docker/.dockerignore @@ -1,4 +1,3 @@ -*.sh *.md *.dockerfile *.zip diff --git a/docker/Dockerfile.extra_install b/docker/Dockerfile.extra_install new file mode 100644 index 000000000..a815f7123 --- /dev/null +++ b/docker/Dockerfile.extra_install @@ -0,0 +1,139 @@ +ENV TZ=Asia/Shanghai +ENV arch=x86_64 +SHELL ["/bin/bash", "-c"] +COPY docker/rcfiles /tmp/resources +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall ca-certificates && \ + apt-get install -y make apt-utils openssh-server locales wget git strace gdb sox libopenmpi-dev curl \ + iputils-ping net-tools iproute2 autoconf automake gperf libre2-dev libssl-dev \ + libtool libcurl4-openssl-dev libb64-dev libgoogle-perftools-dev patchelf \ + rapidjson-dev scons software-properties-common pkg-config unzip zlib1g-dev \ + libbz2-dev libreadline-dev libsqlite3-dev llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev liblzma-dev \ + libarchive-dev libxml2-dev libnuma-dev cmake \ + libgeos-dev strace vim ffmpeg libsm6 tzdata language-pack-zh-hans \ + ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build \ + libjpeg-dev libpng-dev && \ + wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ + dpkg -i ./git-lfs_3.2.0_amd64.deb && \ + rm -f ./git-lfs_3.2.0_amd64.deb && \ + locale-gen zh_CN && \ + locale-gen zh_CN.utf8 && \ + update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \ + ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get clean 
&& \ + rm -rf /var/lib/apt/lists/* + +ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 +RUN wget -O /tmp/boost.tar.gz https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz && \ + cd /tmp && tar xzf boost.tar.gz && \ + mv /tmp/boost_1_80_0/boost /usr/include/boost && \ + rm -rf /tmp/boost_1_80_0 && rm -rf boost.tar.gz + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +RUN set -eux; \ + \ + wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz"; \ + wget -O python.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc"; \ + GNUPGHOME="$(mktemp -d)"; export GNUPGHOME; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$GPG_KEY"; \ + gpg --batch --verify python.tar.xz.asc python.tar.xz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" python.tar.xz.asc; \ + mkdir -p /usr/src/python; \ + tar --extract --directory /usr/src/python --strip-components=1 --file python.tar.xz; \ + rm python.tar.xz; \ + \ + cd /usr/src/python; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ + ./configure \ + --build="$gnuArch" \ + --enable-loadable-sqlite-extensions \ + --enable-optimizations \ + --enable-option-checking=fatal \ + --enable-shared \ + --with-lto \ + --with-system-expat \ + --without-ensurepip \ + ; \ + nproc="$(nproc)"; \ + EXTRA_CFLAGS="$(dpkg-buildflags --get CFLAGS)"; \ + LDFLAGS="$(dpkg-buildflags --get LDFLAGS)"; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:-}" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + ; \ + rm python; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:--Wl},-rpath='\$\$ORIGIN/../lib'" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + python \ + ; \ + make install; \ + \ + bin="$(readlink -ve /usr/local/bin/python3)"; \ + dir="$(dirname "$bin")"; \ + mkdir -p "/usr/share/gdb/auto-load/$dir"; \ + cp -vL Tools/gdb/libpython.py "/usr/share/gdb/auto-load/$bin-gdb.py"; \ + \ + cd /; \ + rm -rf /usr/src/python; \ + \ + find /usr/local -depth \ + \( \ + \( -type d -a \( -name test -o -name tests -o -name idle_test \) \) \ + -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' -o -name 'libpython*.a' \) \) \ + \) -exec rm -rf '{}' + \ + ; \ + \ + ldconfig; \ + \ + python3 --version + +# make some useful symlinks that are expected to exist ("/usr/local/bin/python" and friends) +RUN set -eux; \ + for src in idle3 pydoc3 python3 python3-config; do \ + dst="$(echo "$src" | tr -d 3)"; \ + [ -s "/usr/local/bin/$src" ]; \ + [ ! 
-e "/usr/local/bin/$dst" ]; \ + ln -svT "$src" "/usr/local/bin/$dst"; \ + done + +# if this is called "PIP_VERSION", pip explodes with "ValueError: invalid truth value ''" +ENV PYTHON_PIP_VERSION 23.0.1 +# https://github.com/docker-library/python/issues/365 +ENV PYTHON_SETUPTOOLS_VERSION 65.5.1 +# https://github.com/pypa/get-pip +ENV PYTHON_GET_PIP_URL https://github.com/pypa/get-pip/raw/dbf0c85f76fb6e1ab42aa672ffca6f0a675d9ee4/public/get-pip.py +ENV PYTHON_GET_PIP_SHA256 dfe9fd5c28dc98b5ac17979a953ea550cec37ae1b47a5116007395bfacff2ab9 + +RUN set -eux; \ + \ + wget -O get-pip.py "$PYTHON_GET_PIP_URL"; \ + echo "$PYTHON_GET_PIP_SHA256 *get-pip.py" | sha256sum -c -; \ + \ + export PYTHONDONTWRITEBYTECODE=1; \ + \ + python get-pip.py \ + --disable-pip-version-check \ + --no-cache-dir \ + --no-compile \ + "pip==$PYTHON_PIP_VERSION" \ + "setuptools==$PYTHON_SETUPTOOLS_VERSION" \ + ; \ + rm -f get-pip.py; \ + \ + pip --version +# end of install python diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 4ac4fd533..0ec13d124 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -1,12 +1,30 @@ -ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-base -FROM $BASE_IMAGE +FROM {base_image} -RUN apt-get update && apt-get install -y iputils-ping net-tools iproute2 && \ +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai +ENV arch=x86_64 + +COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh +RUN apt-get update && \ + apt-get install -y libsox-dev unzip libaio-dev zip iputils-ping telnet sudo git net-tools && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# install modelscope + +{extra_content} + +COPY {meta_file} /tmp/install.sh + +ARG INSTALL_MS_DEPS={install_ms_deps} + +# install dependencies COPY requirements /var/modelscope -RUN pip install --no-cache-dir --upgrade pip && \ + +RUN pip uninstall ms-swift modelscope -y && pip --no-cache-dir install pip==23.* -U && \ + pip install --no-cache-dir apex -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ +if [ "$INSTALL_MS_DEPS" = "True" ]; then \ + pip --no-cache-dir install omegaconf==2.0.6 && \ + pip install --no-cache-dir 'cython<=0.29.36' versioneer 'numpy<2.0' -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir kwsbp==0.0.6 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ @@ -14,49 +32,39 @@ RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/tests.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ - pip cache purge + pip install --no-cache-dir -r /var/modelscope/server.txt && \ + pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/packages/imageio_ffmpeg-0.4.9-py3-none-any.whl --no-dependencies 
--force && \ + pip install adaseq pai-easycv && \ + pip install --no-cache-dir 'scipy<1.13.0' && \ + pip install --no-cache-dir funtextprocessing typeguard==2.13.3 scikit-learn -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir text2sql_lgesql==1.3.0 git+https://github.com/jin-s13/xtcocoapi.git@v1.14 git+https://github.com/gatagat/lap.git@v0.4.0 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html --force --no-deps && \ + pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 mpi4py paint_ldm ipykernel fasttext -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip cache purge; \ +else \ + pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip cache purge; \ +fi -# install jupyter plugin -RUN mkdir -p /root/.local/share/jupyter/labextensions/ && \ - cp -r /tmp/resources/jupyter_plugins/* /root/.local/share/jupyter/labextensions/ +RUN echo "cache bust $(date +%Y%m%d%H%M%S)" && \ + sh /tmp/install.sh {version_args} && \ + curl -fsSL https://ollama.com/install.sh | sh && \ + pip install --no-cache-dir -U funasr scikit-learn && \ + pip install --no-cache-dir -U qwen_vl_utils pyav librosa timm transformers accelerate peft trl safetensors && \ + cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {modelscope_branch} --single-branch https://github.com/modelscope/modelscope.git && \ + cd modelscope && pip install . -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + cd / && rm -fr /tmp/modelscope && pip cache purge; \ + cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {swift_branch} --single-branch https://github.com/modelscope/ms-swift.git && \ + cd ms-swift && pip install .[llm] && \ + pip install .[eval] && pip install evalscope -U --no-dependencies && pip install xtuner --no-dependencies && \ + cd / && rm -fr /tmp/ms-swift && pip cache purge; \ + pip install --no-cache-dir torch=={torch_version} torchvision=={torchvision_version} torchaudio=={torchaudio_version} {index_url} && \ + pip install --no-cache-dir transformers huggingface-hub==0.25.* -U && pip cache purge; \ + pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ + pip config set install.trusted-host mirrors.aliyun.com && \ + cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list -COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh -# python3.8 pip install git+https://github.com/jin-s13/xtcocoapi.git@v1.13 -# pip install git+https://github.com/gatagat/lap.git@v0.4.0 -RUN pip install --no-cache-dir text2sql_lgesql==1.3.0 \ - git+https://github.com/jin-s13/xtcocoapi.git@v1.13 \ - git+https://github.com/gatagat/lap.git@v0.4.0 \ - detectron2==0.3 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html --force --no-deps - -RUN pip install --no-cache-dir mpi4py paint_ldm \ - mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 pai-easycv ms_swift \ - ipykernel fasttext fairseq deepspeed -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html - -ARG USE_GPU -# for cpu install cpu version faiss, faiss depends on blas lib, we install libopenblas TODO rename gpu or cpu version faiss -RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir funtextprocessing kwsbp==0.0.6 faiss==1.7.2 safetensors typeguard==2.13.3 scikit-learn librosa==0.9.2 funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - else \ - pip install --no-cache-dir 
funtextprocessing kwsbp==0.0.6 https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/faiss-1.7.2-py37-none-linux_x86_64.whl safetensors typeguard==2.13.3 scikit-learn librosa==0.9.2 funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - fi - -RUN pip install --no-cache-dir wenetruntime==1.11.0 adaseq --no-deps -COPY examples /modelscope/examples - -# for pai-easycv setup compatiblity issue ENV SETUPTOOLS_USE_DISTUTILS=stdlib - -RUN if [ "$USE_GPU" = "True" ] ; then \ - CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6" pip install --no-cache-dir 'git+https://github.com/facebookresearch/detectron2.git'; \ - else \ - echo 'cpu unsupport detectron2'; \ - fi - -# torchmetrics==0.11.4 for ofa -RUN pip install --no-cache-dir jupyterlab torchmetrics==0.11.4 tiktoken transformers_stream_generator 'protobuf<=3.20.0' bitsandbytes basicsr -COPY docker/scripts/install_flash_attension.sh /tmp/install_flash_attension.sh -RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_flash_attension.sh; \ - else \ - echo 'cpu unsupport flash attention'; \ - fi +ENV VLLM_USE_MODELSCOPE=True +ENV LMDEPLOY_USE_MODELSCOPE=True +ENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope +SHELL ["/bin/bash", "-c"] diff --git a/docker/Dockerfile.ubuntu_base b/docker/Dockerfile.ubuntu_base index b848e1a12..903c99309 100644 --- a/docker/Dockerfile.ubuntu_base +++ b/docker/Dockerfile.ubuntu_base @@ -1,20 +1,20 @@ -ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel +ARG BASE_IMAGE={base_image} FROM $BASE_IMAGE ARG DEBIAN_FRONTEND=noninteractive ENV TZ=Asia/Shanghai -ENV CONDA_DIR /opt/conda -ENV PATH="${CONDA_DIR}/bin:${PATH}" ENV arch=x86_64 SHELL ["/bin/bash", "-c"] COPY docker/rcfiles /tmp/resources -COPY docker/jupyter_plugins /tmp/resources/jupyter_plugins -RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ - apt-get clean && \ - cp /tmp/resources/sources.list.aliyun /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y locales wget git strace gdb sox libopenmpi-dev curl \ +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall ca-certificates && \ + apt-get install -y make apt-utils openssh-server locales wget git strace gdb sox libopenmpi-dev curl \ + iputils-ping net-tools iproute2 autoconf automake gperf libre2-dev libssl-dev \ + libtool libcurl4-openssl-dev libb64-dev libgoogle-perftools-dev patchelf \ + rapidjson-dev scons software-properties-common pkg-config unzip zlib1g-dev \ + libbz2-dev libreadline-dev libsqlite3-dev llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev liblzma-dev \ + libarchive-dev libxml2-dev libnuma-dev cmake \ libgeos-dev strace vim ffmpeg libsm6 tzdata language-pack-zh-hans \ - ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \ + ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build \ + libjpeg-dev libpng-dev && \ wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ dpkg -i ./git-lfs_3.2.0_amd64.deb && \ rm -f ./git-lfs_3.2.0_amd64.deb && \ @@ -27,118 +27,189 @@ RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ rm -rf /var/lib/apt/lists/* ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 +RUN wget -O /tmp/boost.tar.gz https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz && \ + cd /tmp && tar xzf boost.tar.gz && \ + mv 
/tmp/boost_1_80_0/boost /usr/include/boost && \ + rm -rf /tmp/boost_1_80_0 && rm -rf boost.tar.gz + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +RUN set -eux; \ + \ + wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz"; \ + wget -O python.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc"; \ + GNUPGHOME="$(mktemp -d)"; export GNUPGHOME; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$GPG_KEY"; \ + gpg --batch --verify python.tar.xz.asc python.tar.xz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" python.tar.xz.asc; \ + mkdir -p /usr/src/python; \ + tar --extract --directory /usr/src/python --strip-components=1 --file python.tar.xz; \ + rm python.tar.xz; \ + \ + cd /usr/src/python; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ + ./configure \ + --build="$gnuArch" \ + --enable-loadable-sqlite-extensions \ + --enable-optimizations \ + --enable-option-checking=fatal \ + --enable-shared \ + --with-lto \ + --with-system-expat \ + --without-ensurepip \ + ; \ + nproc="$(nproc)"; \ + EXTRA_CFLAGS="$(dpkg-buildflags --get CFLAGS)"; \ + LDFLAGS="$(dpkg-buildflags --get LDFLAGS)"; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:-}" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + ; \ +# https://github.com/docker-library/python/issues/784 +# prevent accidental usage of a system installed libpython of the same version + rm python; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:--Wl},-rpath='\$\$ORIGIN/../lib'" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + python \ + ; \ + make install; \ + \ +# enable GDB to load debugging data: https://github.com/docker-library/python/pull/701 + bin="$(readlink -ve /usr/local/bin/python3)"; \ + dir="$(dirname "$bin")"; \ + mkdir -p "/usr/share/gdb/auto-load/$dir"; \ + cp -vL Tools/gdb/libpython.py "/usr/share/gdb/auto-load/$bin-gdb.py"; \ + \ + cd /; \ + rm -rf /usr/src/python; \ + \ + find /usr/local -depth \ + \( \ + \( -type d -a \( -name test -o -name tests -o -name idle_test \) \) \ + -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' -o -name 'libpython*.a' \) \) \ + \) -exec rm -rf '{}' + \ + ; \ + \ + ldconfig; \ + \ + python3 --version + +# make some useful symlinks that are expected to exist ("/usr/local/bin/python" and friends) +RUN set -eux; \ + for src in idle3 pydoc3 python3 python3-config; do \ + dst="$(echo "$src" | tr -d 3)"; \ + [ -s "/usr/local/bin/$src" ]; \ + [ ! -e "/usr/local/bin/$dst" ]; \ + ln -svT "$src" "/usr/local/bin/$dst"; \ + done -#install and config python -ARG PYTHON_VERSION=3.7.13 -# Miniconda3-py37_23.1.0-1-Linux-x86_64.sh is last python3.7 version -RUN if [ "$PYTHON_VERSION" = "3.7.13" ] ; then \ - wget --quiet https://mirrors.aliyun.com/anaconda/miniconda/Miniconda3-py37_23.1.0-1-Linux-x86_64.sh -O ./miniconda.sh && \ - /bin/bash miniconda.sh -b -p /opt/conda && \ - rm -f miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - cp /tmp/resources/conda.tuna ~/.condarc && \ - source /root/.bashrc && \ - conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip config set install.trusted-host mirrors.aliyun.com;\ -else \ - wget --quiet https://mirrors.aliyun.com/anaconda/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \ - /bin/bash miniconda.sh -b -p /opt/conda && \ - rm -f miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - cp /tmp/resources/conda.tuna ~/.condarc && \ - source /root/.bashrc && \ - conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip config set install.trusted-host mirrors.aliyun.com;\ -fi - -ARG USE_GPU=True +# if this is called "PIP_VERSION", pip explodes with "ValueError: invalid truth value ''" +ENV PYTHON_PIP_VERSION 23.0.1 +# https://github.com/docker-library/python/issues/365 +ENV PYTHON_SETUPTOOLS_VERSION 65.5.1 +# https://github.com/pypa/get-pip +ENV PYTHON_GET_PIP_URL https://github.com/pypa/get-pip/raw/dbf0c85f76fb6e1ab42aa672ffca6f0a675d9ee4/public/get-pip.py +ENV PYTHON_GET_PIP_SHA256 dfe9fd5c28dc98b5ac17979a953ea550cec37ae1b47a5116007395bfacff2ab9 + +RUN set -eux; \ + \ + wget -O get-pip.py "$PYTHON_GET_PIP_URL"; \ + echo "$PYTHON_GET_PIP_SHA256 *get-pip.py" | sha256sum -c -; \ + \ + export PYTHONDONTWRITEBYTECODE=1; \ + \ + python get-pip.py \ + --disable-pip-version-check \ + --no-cache-dir \ + --no-compile \ + "pip==$PYTHON_PIP_VERSION" \ + "setuptools==$PYTHON_SETUPTOOLS_VERSION" \ + ; \ + rm -f get-pip.py; \ + \ + pip --version +# end of install python + +ARG USE_GPU={use_gpu} # install pytorch -ARG TORCH_VERSION=1.12.0 -ARG CUDATOOLKIT_VERSION=cu117 -RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDATOOLKIT_VERSION; \ - else \ - pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ - fi +ARG TORCH_VERSION={torch_version} +ARG CUDATOOLKIT_VERSION={cudatoolkit_version} -# install tensorflow -ARG TENSORFLOW_VERSION=1.15.5 RUN if [ "$USE_GPU" = "True" ] ; then \ - if [ "$TENSORFLOW_VERSION" = "1.15.5" ] ; then \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - else \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - fi \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio; \ else \ - # only python 3.7 has tensorflow 1.15.5 - if [ "$PYTHON_VERSION" = "3.7.13" ] ; then \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - elif [ "$TENSORFLOW_VERSION" = "1.15.5" ] ; then \ - pip install --no-cache-dir numpy==1.18.5 https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/tensorflow-1.15.5-cp38-cp38-linux_x86_64.whl; \ - else \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - fi \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ fi -# mmcv-full<=1.7.0 for mmdet3d compatible -RUN if [ "$USE_GPU" = "True" ] ; then \ - CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 
FORCE_CUDA=1 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \ - else \ - MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \ - fi # default shell bash ENV SHELL=/bin/bash # install special package RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install dgl -f https://data.dgl.ai/wheels/$CUDATOOLKIT_VERSION/repo.html; \ + pip install --no-cache-dir dgl -f https://data.dgl.ai/wheels/$CUDATOOLKIT_VERSION/repo.html; \ else \ - pip install --no-cache-dir dgl==0.9.0 dglgo -f https://data.dgl.ai/wheels/repo.html; \ + pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \ fi # copy install scripts COPY docker/scripts/install_unifold.sh docker/scripts/install_colmap.sh docker/scripts/install_pytorch3d_nvdiffrast.sh docker/scripts/install_tiny_cuda_nn.sh docker/scripts/install_apex.sh /tmp/ -# for uniford -RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_unifold.sh; \ - else \ - echo 'cpu unsupport uniford'; \ - fi - -RUN if [ "$USE_GPU" = "True" ] ; then \ - export TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6+PTX" && pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \ - else \ - echo 'cpu unsupport Pointnet2'; \ - fi - # 3d supports RUN if [ "$USE_GPU" = "True" ] ; then \ bash /tmp/install_colmap.sh; \ else \ echo 'cpu unsupport colmap'; \ fi +# install pytorch3d RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_tiny_cuda_nn.sh \ + bash /tmp/install_pytorch3d_nvdiffrast.sh; \ else \ - echo 'cpu unsupport tiny_cudann'; \ + echo 'cpu unsupport pytorch3d nvdiffrast'; \ fi + +# for uniford RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_pytorch3d_nvdiffrast.sh; \ + bash /tmp/install_unifold.sh; \ else \ - echo 'cpu unsupport pytorch3d nvdiffrast'; \ + echo 'cpu unsupport uniford'; \ fi -# end of 3D -# install apex after deepspeed + RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_apex.sh; \ + export TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.9;9.0;8.6+PTX" && pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \ else \ - echo 'cpu unsupport apex'; \ + echo 'cpu unsupport Pointnet2'; \ fi + +ARG TENSORFLOW_VERSION={tf_version} + RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ + else \ + echo 'cpu not install tensorflow'; \ + fi + + RUN if [ "$USE_GPU" = "True" ] ; then \ + cd /tmp && git clone -b ms_build --single-branch https://github.com/tastelikefeet/mmcv.git && cd mmcv && TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.9;9.0;8.6+PTX" MMCV_WITH_OPS=1 MAX_JOBS=32 FORCE_CUDA=1 pip install . && cd / && rm -fr /tmp/mmcv && pip cache purge; \ + else \ + cd /tmp && git clone -b ms_build --single-branch https://github.com/tastelikefeet/mmcv.git && cd mmcv && MMCV_WITH_OPS=1 MAX_JOBS=32 pip install . 
&& cd / && rm -fr /tmp/mmcv && pip cache purge; \ + fi + + # This limits the cuda121 version + RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir --force tinycudann==1.7 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ + else \ + echo 'cpu not install tinycudann'; \ + fi + + RUN pip install --no-cache-dir fairseq + ENTRYPOINT [] diff --git a/docker/build_image.py b/docker/build_image.py new file mode 100644 index 000000000..344fc9d37 --- /dev/null +++ b/docker/build_image.py @@ -0,0 +1,365 @@ +import argparse +import os +from datetime import datetime +from typing import Any + +docker_registry = os.environ['DOCKER_REGISTRY'] +assert docker_registry, 'You must pass a valid DOCKER_REGISTRY' +timestamp = datetime.now() +formatted_time = timestamp.strftime('%Y%m%d%H%M%S') + + +class Builder: + + def __init__(self, args: Any, dry_run: bool): + self.args = self.init_args(args) + self.dry_run = dry_run + self.args.cudatoolkit_version = self._generate_cudatoolkit_version( + args.cuda_version) + self.args.python_tag = self._generate_python_tag(args.python_version) + + def init_args(self, args: Any) -> Any: + if not args.base_image: + # A mirrored image of nvidia/cuda:12.4.0-devel-ubuntu22.04 + args.base_image = 'nvidia/cuda:12.1.0-devel-ubuntu22.04' + if not args.torch_version: + args.torch_version = '2.3.1' + args.torchaudio_version = '2.3.1' + args.torchvision_version = '0.18.1' + if not args.tf_version: + args.tf_version = '2.16.1' + if not args.cuda_version: + args.cuda_version = '12.1.0' + if not args.vllm_version: + args.vllm_version = '0.5.3' + if not args.lmdeploy_version: + args.lmdeploy_version = '0.6.2' + if not args.autogptq_version: + args.autogptq_version = '0.7.1' + return args + + def _generate_cudatoolkit_version(self, cuda_version: str) -> str: + cuda_version = cuda_version[:cuda_version.rfind('.')] + return 'cu' + cuda_version.replace('.', '') + + def _generate_python_tag(self, python_version: str) -> str: + python_version = python_version[:python_version.rfind('.')] + return 'py' + python_version.replace('.', '') + + def generate_dockerfile(self) -> str: + raise NotImplementedError + + def _save_dockerfile(self, content: str) -> None: + if os.path.exists('./Dockerfile'): + os.remove('./Dockerfile') + with open('./Dockerfile', 'w') as f: + f.write(content) + + def build(self) -> int: + pass + + def push(self) -> int: + pass + + def image(self) -> str: + pass + + def __call__(self): + content = self.generate_dockerfile() + self._save_dockerfile(content) + if not self.dry_run: + ret = self.build() + if ret != 0: + raise RuntimeError(f'Docker build error with errno: {ret}') + + ret = self.push() + if ret != 0: + raise RuntimeError(f'Docker push error with errno: {ret}') + + if self.args.ci_image != 0: + ret = os.system( + f'docker tag {self.image()} {docker_registry}:ci_image') + if ret != 0: + raise RuntimeError( + f'Docker tag ci_image error with errno: {ret}') + + +class BaseCPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + with open('docker/Dockerfile.ubuntu_base', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = content.replace('{use_gpu}', 'False') + content = content.replace('{python_version}', self.args.python_version) + content = content.replace('{torch_version}', self.args.torch_version) + content = content.replace('{cudatoolkit_version}', + self.args.cudatoolkit_version) + content = content.replace('{tf_version}', self.args.tf_version) + return content 
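+    # The Dockerfile template is filled in with plain str.replace on literal
+    # placeholders such as {base_image}, {use_gpu}, {python_version},
+    # {cudatoolkit_version} and {tf_version}; str.format is not used, so any
+    # other braces in the template are left untouched.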
+ + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-base') + + def build(self): + return os.system( + f'DOCKER_BUILDKIT=0 docker build -t {self.image()} -f Dockerfile .' + ) + + def push(self): + return os.system(f'docker push {self.image()}') + + +class BaseGPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + with open('docker/Dockerfile.ubuntu_base', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = content.replace('{use_gpu}', 'True') + content = content.replace('{python_version}', self.args.python_version) + content = content.replace('{torch_version}', self.args.torch_version) + content = content.replace('{cudatoolkit_version}', + self.args.cudatoolkit_version) + content = content.replace('{tf_version}', self.args.tf_version) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-tf{self.args.tf_version}-base') + + def build(self) -> int: + return os.system( + f'DOCKER_BUILDKIT=0 docker build -t {self.image()} -f Dockerfile .' + ) + + def push(self): + return os.system(f'docker push {self.image()}') + + +class CPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + meta_file = './docker/install_cpu.sh' + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} ' + f'{self.args.torchaudio_version}') + base_image = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}' + f'-torch{self.args.torch_version}-base') + extra_content = '' + + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', base_image) + content = content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'True') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace( + '{index_url}', + '--index-url https://download.pytorch.org/whl/cpu') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-{self.args.modelscope_version}-test' + ) + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-{self.args.modelscope_version}-{formatted_time}-test' + ) + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +class GPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + meta_file = './docker/install.sh' + extra_content = """ +RUN pip install tf-keras==2.16.0 --no-dependencies && \ + pip install 
--no-cache-dir torchsde jupyterlab torchmetrics==0.11.4 basicsr pynvml shortuuid && \ + CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0" \ + pip install --no-cache-dir 'git+https://github.com/facebookresearch/detectron2.git' +""" + + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} {self.args.torchaudio_version} ' + f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}' + ) + base_image = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-tf{self.args.tf_version}-base') + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', base_image) + content = content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'True') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace('{index_url}', '') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-' + f'{self.args.modelscope_version}-test') + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-' + f'{self.args.modelscope_version}-{formatted_time}-test') + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +class LLMImageBuilder(Builder): + + def init_args(self, args) -> Any: + if not args.base_image: + # A mirrored image of nvidia/cuda:12.4.0-devel-ubuntu22.04 + args.base_image = 'nvidia/cuda:12.4.0-devel-ubuntu22.04' + if not args.torch_version: + args.torch_version = '2.4.0' + args.torchaudio_version = '2.4.0' + args.torchvision_version = '0.19.0' + if not args.cuda_version: + args.cuda_version = '12.4.0' + if not args.vllm_version: + args.vllm_version = '0.6.3.post1' + if not args.lmdeploy_version: + args.lmdeploy_version = '0.6.2' + if not args.autogptq_version: + args.autogptq_version = '0.7.1' + return args + + def generate_dockerfile(self) -> str: + meta_file = './docker/install.sh' + with open('docker/Dockerfile.extra_install', 'r') as f: + extra_content = f.read() + extra_content = extra_content.replace('{python_version}', + self.args.python_version) + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} {self.args.torchaudio_version} ' + f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}' + ) + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = 
content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'False') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace('{index_url}', '') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-{self.args.modelscope_version}-LLM-test' + ) + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-' + f'{self.args.modelscope_version}-LLM-{formatted_time}-test') + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +parser = argparse.ArgumentParser() +parser.add_argument('--base_image', type=str, default=None) +parser.add_argument('--image_type', type=str) +parser.add_argument('--python_version', type=str, default='3.10.14') +parser.add_argument('--ubuntu_version', type=str, default='22.04') +parser.add_argument('--torch_version', type=str, default=None) +parser.add_argument('--torchvision_version', type=str, default=None) +parser.add_argument('--cuda_version', type=str, default=None) +parser.add_argument('--ci_image', type=int, default=0) +parser.add_argument('--torchaudio_version', type=str, default=None) +parser.add_argument('--tf_version', type=str, default=None) +parser.add_argument('--vllm_version', type=str, default=None) +parser.add_argument('--lmdeploy_version', type=str, default=None) +parser.add_argument('--autogptq_version', type=str, default=None) +parser.add_argument('--modelscope_branch', type=str, default='master') +parser.add_argument('--modelscope_version', type=str, default='9.99.0') +parser.add_argument('--swift_branch', type=str, default='main') +parser.add_argument('--dry_run', type=int, default=0) + +args = parser.parse_args() + +if args.image_type.lower() == 'base_cpu': + builder_cls = BaseCPUImageBuilder +elif args.image_type.lower() == 'base_gpu': + builder_cls = BaseGPUImageBuilder +elif args.image_type.lower() == 'cpu': + builder_cls = CPUImageBuilder +elif args.image_type.lower() == 'gpu': + builder_cls = GPUImageBuilder +elif args.image_type.lower() == 'llm': + builder_cls = LLMImageBuilder +else: + raise ValueError(f'Unsupported image_type: {args.image_type}') + +builder_cls(args, args.dry_run)() diff --git a/docker/install.sh b/docker/install.sh new file mode 100644 index 000000000..3a6ffc13e --- /dev/null +++ b/docker/install.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +torch_version=${1:-2.4.0} +torchvision_version=${2:-0.19.0} +torchaudio_version=${3:-2.4.0} +vllm_version=${4:-0.6.0} +lmdeploy_version=${5:-0.6.1} +autogptq_version=${6:-0.7.1} + +pip install --no-cache-dir -U autoawq lmdeploy==$lmdeploy_version + +pip uninstall -y 
torch torchvision torchaudio + +pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version + +pip install --no-cache-dir tiktoken transformers_stream_generator bitsandbytes deepspeed torchmetrics decord optimum + +# pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp310-cp310-linux_x86_64.whl +# find on: https://github.com/Dao-AILab/flash-attention/releases +cd /tmp && git clone https://github.com/Dao-AILab/flash-attention.git && cd flash-attention && python setup.py install && cd / && rm -fr /tmp/flash-attention && pip cache purge; + +pip install --no-cache-dir triton auto-gptq==$autogptq_version vllm==$vllm_version -U && pip cache purge + +# pip uninstall -y torch-scatter && TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" pip install --no-cache-dir -U torch-scatter diff --git a/docker/install_cpu.sh b/docker/install_cpu.sh new file mode 100644 index 000000000..43831b92c --- /dev/null +++ b/docker/install_cpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +torch_version=${1:-2.4.0} +torchvision_version=${2:-0.19.0} +torchaudio_version=${3:-2.4.0} + +pip uninstall -y torch torchvision torchaudio + +pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version --index-url https://download.pytorch.org/whl/cpu diff --git a/docker/jupyter_plugins/jupyterlab_active_log/package.json b/docker/jupyter_plugins/jupyterlab_active_log/package.json deleted file mode 100644 index d2e0d0db1..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/package.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "name": "jupyterlab_active_log", - "version": "0.1.0", - "description": "A JupyterLab extension.", - "keywords": [ - "jupyter", - "jupyterlab", - "jupyterlab-extension" - ], - "homepage": "https://github.com/github_username/jupyterlab_active_log", - "bugs": { - "url": "https://github.com/github_username/jupyterlab_active_log/issues" - }, - "license": "BSD-3-Clause", - "files": [ - "lib/**/*.{d.ts,eot,gif,html,jpg,js,js.map,json,png,svg,woff2,ttf}", - "style/**/*.{css,js,eot,gif,html,jpg,json,png,svg,woff2,ttf}" - ], - "main": "lib/index.js", - "types": "lib/index.d.ts", - "style": "style/index.css", - "repository": { - "type": "git", - "url": "https://github.com/github_username/jupyterlab_active_log.git" - }, - "scripts": { - "build": "jlpm build:lib && jlpm build:labextension:dev", - "build:prod": "jlpm clean && jlpm build:lib && jlpm build:labextension", - "build:labextension": "jupyter labextension build .", - "build:labextension:dev": "jupyter labextension build --development True .", - "build:lib": "tsc", - "clean": "jlpm clean:lib", - "clean:lib": "rimraf lib tsconfig.tsbuildinfo", - "clean:lintcache": "rimraf .eslintcache .stylelintcache", - "clean:labextension": "rimraf jupyterlab_active_log/labextension", - "clean:all": "jlpm clean:lib && jlpm clean:labextension && jlpm clean:lintcache", - "eslint": "jlpm eslint:check --fix", - "eslint:check": "eslint . 
--cache --ext .ts,.tsx", - "install:extension": "jlpm build", - "lint": "jlpm stylelint && jlpm prettier && jlpm eslint", - "lint:check": "jlpm stylelint:check && jlpm prettier:check && jlpm eslint:check", - "prettier": "jlpm prettier:base --write --list-different", - "prettier:base": "prettier \"**/*{.ts,.tsx,.js,.jsx,.css,.json,.md}\"", - "prettier:check": "jlpm prettier:base --check", - "stylelint": "jlpm stylelint:check --fix", - "stylelint:check": "stylelint --cache \"style/**/*.css\"", - "watch": "run-p watch:src watch:labextension", - "watch:src": "tsc -w", - "watch:labextension": "jupyter labextension watch ." - }, - "dependencies": { - "@jupyterlab/application": "^3.1.0" - }, - "devDependencies": { - "@jupyterlab/builder": "^3.1.0", - "@typescript-eslint/eslint-plugin": "^4.8.1", - "@typescript-eslint/parser": "^4.8.1", - "eslint": "^7.14.0", - "eslint-config-prettier": "^6.15.0", - "eslint-plugin-prettier": "^3.1.4", - "npm-run-all": "^4.1.5", - "prettier": "^2.1.1", - "rimraf": "^3.0.2", - "stylelint": "^14.3.0", - "stylelint-config-prettier": "^9.0.3", - "stylelint-config-recommended": "^6.0.0", - "stylelint-config-standard": "~24.0.0", - "stylelint-prettier": "^2.0.0", - "typescript": "~4.1.3" - }, - "sideEffects": [ - "style/*.css", - "style/index.js" - ], - "styleModule": "style/index.js", - "publishConfig": { - "access": "public" - }, - "jupyterlab": { - "extension": true, - "outputDir": "jupyterlab_active_log/labextension", - "_build": { - "load": "static/remoteEntry.eb3177c3791d7658cc12.js", - "extension": "./extension", - "style": "./style" - } - }, - "jupyter-releaser": { - "hooks": { - "before-build-npm": [ - "python -m pip install jupyterlab~=3.1", - "jlpm" - ], - "before-build-python": [ - "jlpm clean:all" - ] - } - } -} diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js b/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js deleted file mode 100644 index b70adee6b..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[568],{568:(t,e,a)=>{a.r(e),a.d(e,{default:()=>i});const i={id:"jupyterlab_active_log:plugin",autoStart:!0,activate:t=>{console.log("JupyterLab extension jupyterlab_active_log is activated!"),window.consts=Object.assign(Object.assign({},window.consts),{recordUrl:"https://modelscope.cn/api/v1/notebooks/activelog",timerDuration:1e4,timerParams:function(){const t=location.pathname.split("/");let e;return t.length>=2&&(e=t[1]),{site:"dsw",id:e,ext:{pathname:location.pathname}}}});const e=document.body,a=e.insertBefore(document.createElement("script"),e.firstChild);a.setAttribute("id","timer-sdk"),a.setAttribute("src","https://g.alicdn.com/alifanyi/translate-js-sdk/timer.js ")}}}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js b/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js deleted file mode 100644 index 2129fc3d0..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[747],{150:(e,n,t)=>{t.d(n,{Z:()=>a});var r=t(645),o=t.n(r)()((function(e){return e[1]}));o.push([e.id,"/*\n See the JupyterLab Developer Guide for useful CSS Patterns:\n\n 
https://jupyterlab.readthedocs.io/en/stable/developer/css.html\n*/\n",""]);const a=o},645:e=>{e.exports=function(e){var n=[];return n.toString=function(){return this.map((function(n){var t=e(n);return n[2]?"@media ".concat(n[2]," {").concat(t,"}"):t})).join("")},n.i=function(e,t,r){"string"==typeof e&&(e=[[null,e,""]]);var o={};if(r)for(var a=0;a{var r,o=function(){var e={};return function(n){if(void 0===e[n]){var t=document.querySelector(n);if(window.HTMLIFrameElement&&t instanceof window.HTMLIFrameElement)try{t=t.contentDocument.head}catch(e){t=null}e[n]=t}return e[n]}}(),a=[];function i(e){for(var n=-1,t=0;t{t.r(n);var r=t(379),o=t.n(r),a=t(150);o()(a.Z,{insert:"head",singleton:!1}),a.Z.locals}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js b/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js deleted file mode 100644 index ec49e9734..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js +++ /dev/null @@ -1 +0,0 @@ -var _JUPYTERLAB;(()=>{"use strict";var e,r,t={293:(e,r,t)=>{var o={"./index":()=>t.e(568).then((()=>()=>t(568))),"./extension":()=>t.e(568).then((()=>()=>t(568))),"./style":()=>t.e(747).then((()=>()=>t(747)))},a=(e,r)=>(t.R=r,r=t.o(o,e)?o[e]():Promise.resolve().then((()=>{throw new Error('Module "'+e+'" does not exist in container.')})),t.R=void 0,r),n=(e,r)=>{if(t.S){var o="default",a=t.S[o];if(a&&a!==e)throw new Error("Container initialization failed as it has already been initialized with a different share scope");return t.S[o]=e,t.I(o,r)}};t.d(r,{get:()=>a,init:()=>n})}},o={};function a(e){var r=o[e];if(void 0!==r)return r.exports;var n=o[e]={id:e,exports:{}};return t[e](n,n.exports,a),n.exports}a.m=t,a.c=o,a.n=e=>{var r=e&&e.__esModule?()=>e.default:()=>e;return a.d(r,{a:r}),r},a.d=(e,r)=>{for(var t in r)a.o(r,t)&&!a.o(e,t)&&Object.defineProperty(e,t,{enumerable:!0,get:r[t]})},a.f={},a.e=e=>Promise.all(Object.keys(a.f).reduce(((r,t)=>(a.f[t](e,r),r)),[])),a.u=e=>e+"."+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e]+".js?v="+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e],a.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),a.o=(e,r)=>Object.prototype.hasOwnProperty.call(e,r),e={},r="jupyterlab_active_log:",a.l=(t,o,n,i)=>{if(e[t])e[t].push(o);else{var l,u;if(void 0!==n)for(var c=document.getElementsByTagName("script"),d=0;d{l.onerror=l.onload=null,clearTimeout(f);var a=e[t];if(delete e[t],l.parentNode&&l.parentNode.removeChild(l),a&&a.forEach((e=>e(o))),r)return r(o)},f=setTimeout(p.bind(null,void 0,{type:"timeout",target:l}),12e4);l.onerror=p.bind(null,l.onerror),l.onload=p.bind(null,l.onload),u&&document.head.appendChild(l)}},a.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},(()=>{a.S={};var e={},r={};a.I=(t,o)=>{o||(o=[]);var n=r[t];if(n||(n=r[t]={}),!(o.indexOf(n)>=0)){if(o.push(n),e[t])return e[t];a.o(a.S,t)||(a.S[t]={});var i=a.S[t],l="jupyterlab_active_log",u=[];return"default"===t&&((e,r,t,o)=>{var n=i[e]=i[e]||{},u=n[r];(!u||!u.loaded&&(1!=!u.eager?o:l>u.from))&&(n[r]={get:()=>a.e(568).then((()=>()=>a(568))),from:l,eager:!1})})("jupyterlab_active_log","0.1.0"),e[t]=u.length?Promise.all(u).then((()=>e[t]=1)):1}}})(),(()=>{var e;a.g.importScripts&&(e=a.g.location+"");var 
r=a.g.document;if(!e&&r&&(r.currentScript&&(e=r.currentScript.src),!e)){var t=r.getElementsByTagName("script");t.length&&(e=t[t.length-1].src)}if(!e)throw new Error("Automatic publicPath is not supported in this browser");e=e.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/"),a.p=e})(),(()=>{var e={346:0};a.f.j=(r,t)=>{var o=a.o(e,r)?e[r]:void 0;if(0!==o)if(o)t.push(o[2]);else{var n=new Promise(((t,a)=>o=e[r]=[t,a]));t.push(o[2]=n);var i=a.p+a.u(r),l=new Error;a.l(i,(t=>{if(a.o(e,r)&&(0!==(o=e[r])&&(e[r]=void 0),o)){var n=t&&("load"===t.type?"missing":t.type),i=t&&t.target&&t.target.src;l.message="Loading chunk "+r+" failed.\n("+n+": "+i+")",l.name="ChunkLoadError",l.type=n,l.request=i,o[1](l)}}),"chunk-"+r,r)}};var r=(r,t)=>{var o,n,[i,l,u]=t,c=0;if(i.some((r=>0!==e[r]))){for(o in l)a.o(l,o)&&(a.m[o]=l[o]);u&&u(a)}for(r&&r(t);c 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a good time -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ - -# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 -# FROM ${BASE_IMAGE} as dev-base - -# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base -FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel -# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime -# config pip source -RUN mkdir /root/.pip -COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf -COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list - -# Install essential Ubuntu packages -RUN apt-get update &&\ - apt-get install -y software-properties-common \ - build-essential \ - git \ - wget \ - vim \ - curl \ - zip \ - zlib1g-dev \ - unzip \ - pkg-config \ - libsndfile1 - -# install modelscope and its python env -WORKDIR /opt/modelscope -COPY . . 
-RUN pip install -r requirements.txt -# RUN --mount=type=cache,target=/opt/ccache \ -# python setup.py install - -# opencv-python-headless conflict with opencv-python installed -RUN python setup.py install \ - && pip uninstall -y opencv-python-headless - -# prepare modelscope libs -COPY docker/scripts/install_libs.sh /tmp/ -RUN bash /tmp/install_libs.sh && \ - rm -rf /tmp/install_libs.sh - -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64 - -WORKDIR /workspace diff --git a/docker/rcfiles/conda.tuna b/docker/rcfiles/conda.tuna deleted file mode 100644 index ce8a29085..000000000 --- a/docker/rcfiles/conda.tuna +++ /dev/null @@ -1,15 +0,0 @@ -channels: - - defaults -show_channel_urls: true -default_channels: - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 -custom_channels: - conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud diff --git a/docker/rcfiles/pip.conf.tsinghua b/docker/rcfiles/pip.conf.tsinghua deleted file mode 100644 index 4242075a4..000000000 --- a/docker/rcfiles/pip.conf.tsinghua +++ /dev/null @@ -1,2 +0,0 @@ -[global] -index-url=https://pypi.tuna.tsinghua.edu.cn/simple diff --git a/docker/rcfiles/sources.list.aliyun b/docker/rcfiles/sources.list.aliyun deleted file mode 100644 index 1ebf4ae53..000000000 --- a/docker/rcfiles/sources.list.aliyun +++ /dev/null @@ -1,14 +0,0 @@ -deb https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse - -# deb https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse diff --git a/docker/rcfiles/ubuntu20.04_sources.tuna b/docker/rcfiles/ubuntu20.04_sources.tuna deleted file mode 100644 index a247bbfa6..000000000 --- a/docker/rcfiles/ubuntu20.04_sources.tuna +++ /dev/null @@ -1,13 +0,0 @@ -# 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse -# deb-src 
https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse - -# 预发布软件源,不建议启用 -# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse diff --git a/docker/rcfiles/ubuntu2204.aliyun b/docker/rcfiles/ubuntu2204.aliyun new file mode 100644 index 000000000..d5dce70cf --- /dev/null +++ b/docker/rcfiles/ubuntu2204.aliyun @@ -0,0 +1,10 @@ +deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse +#deb http://mirrors.aliyun.com/ubuntu/ jammy-proposed main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-proposed main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-backports main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-backports main restricted universe multiverse diff --git a/docker/rcfiles/user.vimrc b/docker/rcfiles/user.vimrc deleted file mode 100644 index 590aca43f..000000000 --- a/docker/rcfiles/user.vimrc +++ /dev/null @@ -1,10 +0,0 @@ -set nocompatible -set encoding=utf-8 -set hlsearch -set smartindent -set ruler -set number -set ts=2 -set sw=2 -set expandtab -autocmd FileType make setlocal noexpandtab diff --git a/docker/scripts/install_apex.sh b/docker/scripts/install_apex.sh index 40d9f268f..7ecd288b4 100644 --- a/docker/scripts/install_apex.sh +++ b/docker/scripts/install_apex.sh @@ -2,6 +2,6 @@ export MAX_JOBS=16 \ && git clone https://github.com/NVIDIA/apex \ && cd apex \ && git checkout 6bd01c4b99a84648ad5e5238a959735e6936c813 \ -&& TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6" pip install -v --disable-pip-version-check --no-cache --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ +&& TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.9;9.0;8.6+PTX" pip install -v --disable-pip-version-check --no-cache --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && cd .. 
\ && rm -rf apex diff --git a/docker/scripts/install_colmap.sh b/docker/scripts/install_colmap.sh index f21fca1d8..ada7077ab 100644 --- a/docker/scripts/install_colmap.sh +++ b/docker/scripts/install_colmap.sh @@ -8,7 +8,7 @@ wget -q https://cmake.org/files/v3.25/cmake-3.25.2-linux-x86_64.sh \ && export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && export MAX_JOBS=16 \ && export CUDA_ARCHITECTURES="all" \ - && git clone --depth 1 --branch 3.8 https://github.com/colmap/colmap.git \ + && git clone https://github.com/colmap/colmap.git \ && cd colmap \ && mkdir build \ && cd build \ diff --git a/docker/scripts/install_flash_attension.sh b/docker/scripts/install_flash_attension.sh deleted file mode 100644 index f37e567d9..000000000 --- a/docker/scripts/install_flash_attension.sh +++ /dev/null @@ -1,4 +0,0 @@ - git clone -b v2.3.2 https://github.com/Dao-AILab/flash-attention && \ - cd flash-attention && python setup.py install && \ - cd .. && \ - rm -rf flash-attention diff --git a/docker/scripts/install_pytorch3d_nvdiffrast.sh b/docker/scripts/install_pytorch3d_nvdiffrast.sh index c7880f92d..c64ea7fb5 100644 --- a/docker/scripts/install_pytorch3d_nvdiffrast.sh +++ b/docker/scripts/install_pytorch3d_nvdiffrast.sh @@ -1,6 +1,7 @@ export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && export MAX_JOBS=36 \ - && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ + && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;8.6+PTX;87;89;90" \ + && export TORCH_CUDA_ARCH_LIST="5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6+PTX;8.7;8.9;9.0" \ && git clone --branch 2.1.0 --recursive https://github.com/NVIDIA/thrust.git \ && cd thrust \ && mkdir build \ @@ -10,7 +11,11 @@ export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && cd ../.. \ && rm -rf thrust \ && pip install --no-cache-dir fvcore iopath \ - && pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ + && curl -LO https://github.com/NVIDIA/cub/archive/2.1.0.tar.gz \ + && tar xzf 2.1.0.tar.gz \ + && export CUB_HOME=$PWD/cub-2.1.0 \ + && FORCE_CUDA=1 pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ + && rm -fr 2.1.0.tar.gz $PWD/cub-2.1.0 \ && apt-get update \ && apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1 libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev -y \ && git clone https://github.com/NVlabs/nvdiffrast.git \ diff --git a/docker/scripts/install_tiny_cuda_nn.sh b/docker/scripts/install_tiny_cuda_nn.sh index 96ae5c722..1aaa2863f 100644 --- a/docker/scripts/install_tiny_cuda_nn.sh +++ b/docker/scripts/install_tiny_cuda_nn.sh @@ -1,7 +1,6 @@ -export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export TCNN_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ +export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export TCNN_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;89;90;86" \ && git clone --recursive https://github.com/nvlabs/tiny-cuda-nn \ && cd tiny-cuda-nn \ - && git checkout v1.6 \ && cd bindings/torch \ && python setup.py install \ && cd ../../.. 
\ diff --git a/docker/scripts/torch111_torch3d_nvdiffrast.sh b/docker/scripts/torch111_torch3d_nvdiffrast.sh deleted file mode 100644 index ca86b0ccf..000000000 --- a/docker/scripts/torch111_torch3d_nvdiffrast.sh +++ /dev/null @@ -1,14 +0,0 @@ -export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=4 && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ - && pip install --no-cache-dir fvcore iopath \ - && curl -LO https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz \ - && tar xzf 1.10.0.tar.gz \ - && export CUB_HOME=$PWD/cub-1.10.0 \ - && pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ - && rm -fr 1.10.0.tar.gz cub-1.10.0 \ - && apt-get update \ - && apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1 libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev -y \ - && git clone https://github.com/NVlabs/nvdiffrast.git \ - && cd nvdiffrast \ - && pip install --no-cache-dir . \ - && cd .. \ - && rm -rf nvdiffrast diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf10..a9f9d1a9b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -b json "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/api/modelscope.models.cv.rst b/docs/source/api/modelscope.models.cv.rst index ac52fef12..464e4535f 100644 --- a/docs/source/api/modelscope.models.cv.rst +++ b/docs/source/api/modelscope.models.cv.rst @@ -11,7 +11,6 @@ modelscope.models.cv :nosignatures: :template: classtemplate.rst - easycv_base.EasyCVBaseModel image_colorization.ddcolor.ddcolor_for_image_colorization.DDColorForImageColorization image_deblur.nafnet_for_image_deblur.NAFNetForImageDeblur image_defrcn_fewshot.defrcn_for_fewshot.DeFRCNForFewShot @@ -19,7 +18,6 @@ modelscope.models.cv image_face_fusion.image_face_fusion.ImageFaceFusion image_matching.quadtree_attention_model.QuadTreeAttentionForImageMatching image_skychange.skychange_model.ImageSkychange - language_guided_video_summarization.summarizer.ClipItVideoSummarization panorama_depth_estimation.unifuse_model.PanoramaDepthEstimation video_stabilization.DUTRAFTStabilizer.DUTRAFTStabilizer video_summarization.summarizer.PGLVideoSummarization diff --git a/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst index b5a4b0f6d..32cb97b89 100644 --- a/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst +++ b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst @@ -13,17 +13,14 @@ modelscope.msdatasets.dataset_cls.custom_datasets EasyCVBaseDataset TorchCustomDataset - MovieSceneSegmentationDataset ImageInstanceSegmentationCocoDataset GoproImageDeblurringDataset - LanguageGuidedVideoSummarizationDataset MGeoRankingDataset RedsImageDeblurringDataset TextRankingDataset VecoDataset VideoSummarizationDataset BadImageDetectingDataset - ImageInpaintingDataset ImagePortraitEnhancementDataset ImageQualityAssessmentDegradationDataset ImageQualityAssessmentMosDataset diff --git a/docs/source/api/modelscope.pipelines.audio.rst b/docs/source/api/modelscope.pipelines.audio.rst index 71d7d13b6..4357a84bf 100644 --- a/docs/source/api/modelscope.pipelines.audio.rst +++ b/docs/source/api/modelscope.pipelines.audio.rst @@ -12,7 
+12,6 @@ modelscope.pipelines.audio :template: classtemplate.rst ANSPipeline - AutomaticSpeechRecognitionPipeline InverseTextProcessingPipeline KWSFarfieldPipeline KeyWordSpottingKwsbpPipeline diff --git a/docs/source/api/modelscope.pipelines.cv.rst b/docs/source/api/modelscope.pipelines.cv.rst index b2190ef1a..68341e6d1 100644 --- a/docs/source/api/modelscope.pipelines.cv.rst +++ b/docs/source/api/modelscope.pipelines.cv.rst @@ -63,7 +63,6 @@ modelscope.pipelines.cv ImageSkychangePipeline ImageStyleTransferPipeline ImageSuperResolutionPipeline - LanguageGuidedVideoSummarizationPipeline LicensePlateDetectionPipeline LiveCategoryPipeline MaskDINOInstanceSegmentationPipeline diff --git a/docs/source/api/modelscope.pipelines.nlp.rst b/docs/source/api/modelscope.pipelines.nlp.rst index ef783db17..4ad9f9e37 100644 --- a/docs/source/api/modelscope.pipelines.nlp.rst +++ b/docs/source/api/modelscope.pipelines.nlp.rst @@ -11,7 +11,6 @@ modelscope.pipelines.nlp :nosignatures: :template: classtemplate.rst - AutomaticPostEditingPipeline CodeGeeXCodeGenerationPipeline CodeGeeXCodeTranslationPipeline ConversationalTextToSqlPipeline diff --git a/docs/source/api/modelscope.pipelines.science.rst b/docs/source/api/modelscope.pipelines.science.rst index eabb12b6d..e934a7cd8 100644 --- a/docs/source/api/modelscope.pipelines.science.rst +++ b/docs/source/api/modelscope.pipelines.science.rst @@ -10,5 +10,3 @@ modelscope.pipelines.science :toctree: generated :nosignatures: :template: classtemplate.rst - - ProteinStructurePipeline diff --git a/docs/source/api/modelscope.trainers.hooks.rst b/docs/source/api/modelscope.trainers.hooks.rst index 5fd903383..5fe5e3610 100644 --- a/docs/source/api/modelscope.trainers.hooks.rst +++ b/docs/source/api/modelscope.trainers.hooks.rst @@ -14,8 +14,6 @@ modelscope.trainers.hooks builder.build_hook hook.Hook priority.Priority - checkpoint_hook.CheckpointHook - checkpoint_hook.BestCkptSaverHook compression.SparsityHook evaluation_hook.EvaluationHook iter_timer_hook.IterTimerHook diff --git a/docs/source/change_log.md b/docs/source/change_log.md index 1081c148c..e8f286ac0 100644 --- a/docs/source/change_log.md +++ b/docs/source/change_log.md @@ -16,7 +16,7 @@ Second internal release. * add palm2.0 * add space model * add MPLUG model -* add dialog_intent, dialog_modeling, dialog state tracking pipleline +* add dialog_intent, dialog_modeling, dialog state tracking pipeline * add maskedlm model and fill_mask pipeline * add nli pipeline * add sentence similarity pipeline @@ -28,7 +28,7 @@ Second internal release. #### Audio * add tts pipeline -* add kws kwsbp pipline +* add kws kwsbp pipeline * add linear aec pipeline * add ans pipeline diff --git a/docs/source/command.md b/docs/source/command.md new file mode 100644 index 000000000..2d5c73fbc --- /dev/null +++ b/docs/source/command.md @@ -0,0 +1,157 @@ +# ModelScope command line usage +## Supported commands +```bash +modelscope --help +usage: modelscope [] + +positional arguments: + {download,plugin,pipeline,modelcard,model,server,login} + modelscope commands helpers + +options: + -h, --help show this help message and exit + +``` +## login +```bash +modelscope login --help +usage: modelscope [] login [-h] --token TOKEN + +options: + -h, --help show this help message and exit + --token TOKEN The Access Token for modelscope. 
+``` +Get access token: [我的页面](https://modelscope.cn/my/myaccesstoken)获取**SDK 令牌** + + +## download model +```bash +modelscope download --help + + usage: modelscope [] download [-h] --model MODEL [--revision REVISION] [--cache_dir CACHE_DIR] [--local_dir LOCAL_DIR] [--include [INCLUDE ...]] [--exclude [EXCLUDE ...]] [files ...] + + positional arguments: + files Specify relative path to the repository file(s) to download.(e.g 'tokenizer.json', 'onnx/decoder_model.onnx'). + + options: + -h, --help show this help message and exit + --model MODEL The model id to be downloaded. + --revision REVISION Revision of the model. + --cache_dir CACHE_DIR + Cache directory to save model. + --local_dir LOCAL_DIR + File will be downloaded to local location specified bylocal_dir, in this case, cache_dir parameter will be ignored. + --include [INCLUDE ...] + Glob patterns to match files to download.Ignored if file is specified + --exclude [EXCLUDE ...] + Glob patterns to exclude from files to download.Ignored if file is specified +``` +## Usage Examples + +Command Examples([gpt2](https://www.modelscope.cn/models/AI-ModelScope/gpt2/files)) + +### Specify downloading of a single file +```bash + modelscope download --model 'AI-ModelScope/gpt2' 64.tflite +``` + +### Specify multiple files to download +```bash + modelscope download --model 'AI-ModelScope/gpt2' 64.tflite config.json +``` +### Specify certain files to download  +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include 'onnx/*' '*.tflite' +``` +### Filter specified files +```bash + modelscope download --model 'AI-ModelScope/gpt2' --exclude 'onnx/*' '*.tflite'  +``` +### Specify the download cache directory +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include '*.json' --cache_dir './cache_dir' +``` +   The model files will be downloaded to cache\_dir/AI-ModelScope/gpt2/ + +### Specify the local directory for downloading     +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include '*.json' --cache_dir './local_dir' +``` +  The model files will be downloaded to ./local\_dir + +If both the local directory and the cache directory are specified, the local directory will take precedence. + +## model operation +Supports creating models and uploading model files. 
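+
+For example, a typical create-then-upload flow might look like the following (the token, group id, model id and paths are placeholders; the full option list is shown below):
+
+```bash
+modelscope model -act create -tk 'YOUR_SDK_TOKEN' -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -vis 1 -lic 'MIT'
+modelscope model -act upload -tk 'YOUR_SDK_TOKEN' -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -md modelfiles/ -vt 'v0.0.1' -vi 'upload model files'
+```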
+```bash +modelscope model --help +usage: modelscope [] modelcard [-h] [-tk ACCESS_TOKEN] -act {create,upload,download} [-gid GROUP_ID] -mid MODEL_ID [-vis VISIBILITY] [-lic LICENSE] [-ch CHINESE_NAME] [-md MODEL_DIR] [-vt VERSION_TAG] [-vi VERSION_INFO] + +options: + -h, --help show this help message and exit + -tk ACCESS_TOKEN, --access_token ACCESS_TOKEN + the certification of visit ModelScope + -act {create,upload,download}, --action {create,upload,download} + the action of api ModelScope[create, upload] + -gid GROUP_ID, --group_id GROUP_ID + the group name of ModelScope, eg, damo + -mid MODEL_ID, --model_id MODEL_ID + the model name of ModelScope + -vis VISIBILITY, --visibility VISIBILITY + the visibility of ModelScope[PRIVATE: 1, INTERNAL:3, PUBLIC:5] + -lic LICENSE, --license LICENSE + the license of visit ModelScope[Apache License 2.0|GPL-2.0|GPL-3.0|LGPL-2.1|LGPL-3.0|AFL-3.0|ECL-2.0|MIT] + -ch CHINESE_NAME, --chinese_name CHINESE_NAME + the chinese name of ModelScope + -md MODEL_DIR, --model_dir MODEL_DIR + the model_dir of configuration.json + -vt VERSION_TAG, --version_tag VERSION_TAG + the tag of uploaded model + -vi VERSION_INFO, --version_info VERSION_INFO + the info of uploaded model +``` + +### Create model +```bash + modelscope model -act create -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -vis 1 -lic 'MIT' -ch '中文名字' +``` +Will create model THE_MODEL_ID in www.modelscope.cn + +### Upload model files +```bash + modelscope model -act upload -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -md modelfiles/ -vt 'v0.0.1' -vi 'upload model files' +``` + +## Pipeline +Create the template files needed for pipeline. + +```bash +modelscope pipeline --help +usage: modelscope [] pipeline [-h] -act {create} [-tpl TPL_FILE_PATH] [-s SAVE_FILE_PATH] [-f FILENAME] -t TASK_NAME [-m MODEL_NAME] [-p PREPROCESSOR_NAME] [-pp PIPELINE_NAME] [-config CONFIGURATION_PATH] + +options: + -h, --help show this help message and exit + -act {create}, --action {create} + the action of command pipeline[create] + -tpl TPL_FILE_PATH, --tpl_file_path TPL_FILE_PATH + the template be selected for ModelScope[template.tpl] + -s SAVE_FILE_PATH, --save_file_path SAVE_FILE_PATH + the name of custom template be saved for ModelScope + -f FILENAME, --filename FILENAME + the init name of custom template be saved for ModelScope + -t TASK_NAME, --task_name TASK_NAME + the unique task_name for ModelScope + -m MODEL_NAME, --model_name MODEL_NAME + the class of model name for ModelScope + -p PREPROCESSOR_NAME, --preprocessor_name PREPROCESSOR_NAME + the class of preprocessor name for ModelScope + -pp PIPELINE_NAME, --pipeline_name PIPELINE_NAME + the class of pipeline name for ModelScope + -config CONFIGURATION_PATH, --configuration_path CONFIGURATION_PATH + the path of configuration.json for ModelScope +``` + +### Create pipeline files +```bash + modelscope pipeline -act 'create' -t 'THE_PIPELINE_TASK' -m 'THE_MODEL_NAME' -pp 'THE_PIPELINE_NAME' +``` diff --git a/docs/source/develop.md b/docs/source/develop.md index af8ea5e75..c2fde1e63 100644 --- a/docs/source/develop.md +++ b/docs/source/develop.md @@ -119,7 +119,7 @@ git lfs install 2. We use a public read model repository from ModelScope to store test data. The repository has been added by default as a submodule with the path data/test. To clone it, use the following command: ```shell -git clone git@github.com:modelscope/modelscope.git --recursive +git clone https://github.com/modelscope/modelscope.git --recursive ``` 3. 
Each time you add new data, go to the data/test directory (note that you are now in the submodule's git directory), check if you are on the master branch, and pull the latest master branch: diff --git a/docs/source/develop_cn.md b/docs/source/develop_cn.md index e342b43a5..224df8f47 100644 --- a/docs/source/develop_cn.md +++ b/docs/source/develop_cn.md @@ -90,8 +90,7 @@ git lfs install 2. 我们使用 ModelScope 的一个公共读取模型仓库来存储测试数据。该仓库已默认添加为子模块,路径为 data/test。要克隆它,请使用以下命令: ``` - -git clone git@github.com:modelscope/modelscope.git --recursive +git clone https://github.com/modelscope/modelscope.git --recursive ``` 3. 每次添加新数据时,进入 data/test 目录(注意此时您已在子模块的 git 目录中),检查是否在 master 分支上,并拉取最新的 master 分支: diff --git a/docs/source/server.md b/docs/source/server.md new file mode 100644 index 000000000..150f56860 --- /dev/null +++ b/docs/source/server.md @@ -0,0 +1,41 @@ +# modelscope server使用 +## 1. 通用服务 +modelscope库基于fastapi开发一个简单模型服务,可以通过一条命令拉起绝大多数模型 +使用方法: + +```bash +modelscope server --model_id=modelscope/Llama-2-7b-chat-ms --revision=v1.0.5 +``` +我们提供的官方镜像中也可以一个命令启动(镜像还未完成) +```bash +docker run --rm --name maas_dev --shm-size=50gb --gpus='"device=0"' -e MODELSCOPE_CACHE=/modelscope_cache -v /host_path_to_modelscope_cache:/modelscope_cache -p 8000:8000 reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu22.04-cuda11.8.0-py310-torch2.1.0-tf2.14.0-1.9.5-server modelscope server --model_id=modelscope/Llama-2-7b-chat-ms --revision=v1.0.5 +``` +服务默认监听8000端口,您也可以通过--port改变端口,默认服务提供两个接口,接口文档您可以通过 +http://ip:port/docs查看 +通过describe接口,可以获取服务输入输出信息以及输入sample数据,如下图: +![describe](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/describe.jpg) +服务调用接口,可以直接拷贝describe接口example示例数据,如下图: +![call](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/call.jpg) + +## 2. vllm大模型推理 +对于LLM我们提供了vllm推理支持,目前只有部分模型支持vllm。 + +### 2.1 vllm直接支持modelscope模型 +可以通过设置环境变量使得vllm从www.modelscope.cn下载模型。 + +启动普通server +```bash +VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server --model="damo/nlp_gpt2_text-generation_english-base" --revision="v1.0.0" +``` +启动openai兼容接口 +```bash +VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server --model="damo/nlp_gpt2_text-generation_english-base" --revision="v1.0.0" +``` + +如果模型在modelscope cache目录已经存在,则会直接使用cache中的模型,否则会从www.modelscope.cn下载模型。 + +通过modelscope官方镜像启动vllm,指定端口为9090 + +```bash +docker run --rm --name maas_dev --shm-size=50gb --gpus='"device=0"' -e MODELSCOPE_CACHE=/modelscope_cache -v /host_path_to_modelscope_cache:/modelscope_cache -p 9090:9090 reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu22.04-cuda11.8.0-py310-torch2.1.0-tf2.14.0-1.9.5-server python -m vllm.entrypoints.api_server --model "modelscope/Llama-2-7b-chat-ms" --revision "v1.0.5" --port 9090 +``` diff --git a/examples/apps/llm_riddles/README_CN.md b/examples/apps/llm_riddles/README_CN.md index 0f85734c0..143900736 100644 --- a/examples/apps/llm_riddles/README_CN.md +++ b/examples/apps/llm_riddles/README_CN.md @@ -1,12 +1,15 @@ # 完蛋!我被LLM包围了!(LLMRiddles) ## 项目简介 -《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用LLM代码生成, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个游戏中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 +《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用LLM代码生成, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个游戏中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 ## 更新 -2023.11.9 新增两道题目, 新增chatglm-turbo模型🔥 🔥🔥 + +2023.11.9 新增两道题目, 新增chatglm-turbo模型🔥🔥🔥 + 2023.11.7 发布初版demo🔥 + 2023.11.8 
拆分关卡模块和llm,支持关卡独立接入,llm独立接入, 欢迎PR 🔥 🔥 ## 开始游戏 @@ -16,6 +19,7 @@ [LLMRiddles](https://modelscope.cn/studios/LLMRiddles/LLMRiddles/summary) ### 本地运行 + 要开始游戏,请按照以下步骤操作: 1. 克隆项目代码: @@ -28,6 +32,7 @@ 5. 执行启动命令`python app.py`. ## RoadMap + - [x] 初版本源码和创空间体验ready - [x] 支持自定义问题和验证逻辑接入 - [ ] 扩充到9个大关卡,每个关卡9个问题 @@ -35,6 +40,7 @@ - [ ] 支持云端API和本地推理切换 ## 贡献指南 + 我们欢迎大家为《完蛋!我被LLM包围了!》做出贡献,包括提出更多好玩的问题,修复validator的corner case,以及提供更多的玩法。请按以下步骤操作: 1. 访问项目地址 [ModelScope](https://github.com/modelscope/modelscope) 并fork项目。 @@ -44,13 +50,16 @@ 5. 在原项目下发起一个Pull Request。 ## 社区贡献者 + 我们诚挚感谢所有对本项目做出贡献的社区成员,特别是: - idea来源: [haoqiangfan](https://www.zhihu.com/people/haoqiang-fan) - 代码大部分来自于LLM自动生成 ## 支持 + 如果你在游戏过程中遇到任何问题或需要帮助,请通过项目的[Issues页面](https://github.com/modelscope/modelscope/issues)提交你的问题。 ## 版权和许可 + 本项目采用APACHE License许可证。请查看项目中的[LICENSE](https://github.com/modelscope/modelscope/blob/main/LICENSE)文件了解更多信息。 diff --git a/examples/apps/llm_riddles/app.py b/examples/apps/llm_riddles/app.py index 94432043c..30b6febf1 100644 --- a/examples/apps/llm_riddles/app.py +++ b/examples/apps/llm_riddles/app.py @@ -3,12 +3,15 @@ import os import random import re +import tarfile import gradio as gr +import requests from challenges.ch1 import challenge1 from challenges.ch2 import challenge2 from challenges.ch3 import challenge3 from challenges.ch4 import challenge4 +from challenges.ch5 import challenge5 from llm import create_model from PIL import Image, ImageDraw, ImageFont @@ -20,6 +23,7 @@ challenge2, challenge3, challenge4, + challenge5, ] CONGRATS_STR = '所有挑战完成!👏🏻👏🏻👏🏻👏🏻👏🏻👏🏻' @@ -156,6 +160,49 @@ def generate_share_image(state): return gr.Image.update(visible=True, value=img_pil) +def download_resource(url, extract_path='.'): + """ + 下载资源文件,解压到指定路径。 + + Args: + url: 要下载的文件的URL + extract_path: 解压文件的目标路径 + """ + try: + # 定义文件名 + filename = url.split('/')[-1] + + # 下载文件 + print(f'Downloading the file from {url}...') + response = requests.get(url, stream=True) + if response.status_code == 200: + with open(filename, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + else: + print( + f'Error: Unable to download file. Status code: {response.status_code}' + ) + return + + # 解压文件 + print(f'Extracting the file to {extract_path}...') + if tarfile.is_tarfile(filename): + with tarfile.open(filename, 'r:*') as tar: + tar.extractall(path=extract_path) + else: + print('Error: The downloaded file is not a tar file.') + + # 删除临时文件 + print(f'Removing the temporary file {filename}...') + os.remove(filename) + print( + 'File downloaded, extracted, and temporary file removed successfully.' 
+ ) + except Exception as e: + print(f'An error occurred: {e}') + + def create_app(): # Gradio界面构建 block = gr.Blocks() @@ -220,4 +267,8 @@ def create_app(): if __name__ == '__main__': + if not os.path.exists('assets'): + download_resource( + 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/llm_riddles_assets.tar' + ) create_app() diff --git a/examples/apps/llm_riddles/assets/background.png b/examples/apps/llm_riddles/assets/background.png deleted file mode 100644 index 9d0cb3c92..000000000 --- a/examples/apps/llm_riddles/assets/background.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8afcec15a87bcfaff327a5c9564a31ff1fe185a63cb286bd9772c8c68216768a -size 757003 diff --git a/examples/apps/llm_riddles/assets/background0.png b/examples/apps/llm_riddles/assets/background0.png deleted file mode 100644 index 163942802..000000000 --- a/examples/apps/llm_riddles/assets/background0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16afb18994ad0654b31117931aad2ee05863492e964e10f4c559556e29618320 -size 839643 diff --git a/examples/apps/llm_riddles/assets/background1.png b/examples/apps/llm_riddles/assets/background1.png deleted file mode 100644 index 9d0cb3c92..000000000 --- a/examples/apps/llm_riddles/assets/background1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8afcec15a87bcfaff327a5c9564a31ff1fe185a63cb286bd9772c8c68216768a -size 757003 diff --git a/examples/apps/llm_riddles/assets/background2.png b/examples/apps/llm_riddles/assets/background2.png deleted file mode 100644 index adec77231..000000000 --- a/examples/apps/llm_riddles/assets/background2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:966a013913042e1574ccbc299b1914272cb47df69a552bf1723b96b2d8902de3 -size 1114172 diff --git a/examples/apps/llm_riddles/assets/background3.png b/examples/apps/llm_riddles/assets/background3.png deleted file mode 100644 index 97c446d6a..000000000 --- a/examples/apps/llm_riddles/assets/background3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5253bbed99be55e6ac9080ea320df75c95592204696d6d41ba90f9905384fdca -size 1198295 diff --git a/examples/apps/llm_riddles/assets/background4.png b/examples/apps/llm_riddles/assets/background4.png deleted file mode 100644 index fc612898c..000000000 --- a/examples/apps/llm_riddles/assets/background4.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4cf462f8db7583843bc152ccfc87bb033b91880c98db9f83ba87fcca5d5d07f2 -size 1056053 diff --git a/examples/apps/llm_riddles/assets/font.ttf b/examples/apps/llm_riddles/assets/font.ttf deleted file mode 100644 index 7dc813443..000000000 Binary files a/examples/apps/llm_riddles/assets/font.ttf and /dev/null differ diff --git a/examples/apps/llm_riddles/challenges/ch2.py b/examples/apps/llm_riddles/challenges/ch2.py index 5c381de66..91b990150 100644 --- a/examples/apps/llm_riddles/challenges/ch2.py +++ b/examples/apps/llm_riddles/challenges/ch2.py @@ -23,6 +23,14 @@ def get_square_root(n): return int(sympy.sqrt(n)) +# 验证函数 - 微言大义 +def validate_9(response, input): + input_yes = len(input) <= 10 + output_yes = len(response) >= 9 and response.isdigit() and sympy.isprime( + int(response)) + return input_yes and output_yes + + challenge2 = { 'name': '第二章 数字游戏', @@ -114,5 +122,10 @@ def get_square_root(n): char not in input for char in '零一二三四五六七八九十') and len( set(re.findall(r'\d', 
response))) == 10) }, + { + 'title': '第9题 微言大义', + 'description': '请输入10个字以内的问题,使得模型的回答是一个超过一亿的素数', + 'validator': validate_9 + } ] } diff --git a/examples/apps/llm_riddles/challenges/ch5.py b/examples/apps/llm_riddles/challenges/ch5.py new file mode 100644 index 000000000..ce918226a --- /dev/null +++ b/examples/apps/llm_riddles/challenges/ch5.py @@ -0,0 +1,35 @@ +def check_word_in_sentence(words, sentence): + return [word in sentence for word in words] + + +challenge5 = { + 'name': + '第五章 登堂入室', + 'problems': [ + { + 'title': + '第1题 盛夏少年', + 'description': + '模型的回答应该包含“盛夏”、“蝉鸣”、“少年”、“橘子味汽水”这几个词,同时输入的问题不能包含其中任一个词。', + 'validator': + lambda response, input: all( + check_word_in_sentence(['盛夏', '蝉鸣', '少年', '橘子味汽水'], response)) + and not any( + check_word_in_sentence(['盛夏', '蝉鸣', '少年', '橘子味汽水'], input)) + }, + { + 'title': + '第2题 蝉鸣日出', + 'description': + '模型的回答应该包含“盛夏”、“蝉鸣”、“少年”、“橘子味汽水”、“日出”这几个词,同时输入的问题不能包含其中任一个字。', + 'validator': + lambda response, input: all( + check_word_in_sentence( + ['盛夏', '蝉鸣', '少年', '橘子味汽水', '日出'], response)) and not any( + check_word_in_sentence([ + '盛', '夏', '蝉', '鸣', '少', '年', '橘', '子', '味', '汽', + '水', '日', '出' + ], input)) + }, + ] +} diff --git a/examples/apps/llm_riddles/check_challenge.py b/examples/apps/llm_riddles/check_challenge.py new file mode 100644 index 000000000..c8d225208 --- /dev/null +++ b/examples/apps/llm_riddles/check_challenge.py @@ -0,0 +1,28 @@ +from app import challenges, generate_response + + +def check_answer(chap_idx, + challenge_idx, + input='input', + model_name='qwen-max'): + print('第{}章 第{}题'.format(chap_idx + 1, challenge_idx + 1)) + challenge = challenges[chap_idx]['problems'][challenge_idx] + print(challenge['description']) + val_fn = challenge['validator'] + response = generate_response(input, model_name) + try: + res = val_fn(response, input) + print('input:\n', input) + print('response:\n', response) + print('validation result: ', res) + except Exception: + import traceback + traceback.print_exc() + print('failed') + + +if __name__ == '__main__': + chap = 5 + ques = 1 + input = '请使用“盛 夏”、“蝉 鸣”、“少 年”、“橘 子味汽水”这几个词造句' + check_answer(chap - 1, ques - 1, input) diff --git a/examples/pytorch/FILE_TRANSFER.md b/examples/pytorch/FILE_TRANSFER.md new file mode 100644 index 000000000..690e3bcf9 --- /dev/null +++ b/examples/pytorch/FILE_TRANSFER.md @@ -0,0 +1,3 @@ +# NOTE + +`DiT_ImageNet_Demo.ipynb`, `SiT_ImageNet_Demo.ipynb`, `ViViT-demo.ipynb`, `UViT_ImageNet_demo.ipynb` are moved to the [modelscope-classroom repo](https://github.com/modelscope/modelscope-classroom) diff --git a/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb b/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb new file mode 100644 index 000000000..c8ba95556 --- /dev/null +++ b/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a33c5c7a-6d2f-4f38-b72a-ff5f07896184", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install llama-index llama-index-llms-huggingface ipywidgets\n", + "!pip install transformers -U" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fd3b2a78-5782-4f76-8d09-52b6b07a96b8", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:49:50.997974Z", + "iopub.status.busy": "2024-02-21T05:49:50.997681Z", + "iopub.status.idle": 
"2024-02-21T05:49:54.378226Z", + "shell.execute_reply": "2024-02-21T05:49:54.377769Z", + "shell.execute_reply.started": "2024-02-21T05:49:50.997954Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-21 13:49:53,743 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.\n", + "2024-02-21 13:49:53,745 - modelscope - INFO - TensorFlow version 2.14.0 Found.\n", + "2024-02-21 13:49:53,746 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n", + "2024-02-21 13:49:53,746 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!\n", + "2024-02-21 13:49:53,803 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 509123dba36c5e70a95f6780df348471 and a total number of 964 components indexed\n" + ] + } + ], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "\n", + "from IPython.display import Markdown, display\n", + "import torch\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "from llama_index.core.prompts import PromptTemplate\n", + "from modelscope import snapshot_download\n", + "from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding\n", + "from abc import ABC\n", + "from typing import Any, List, Optional, Dict, cast\n", + "from llama_index.core import (\n", + " VectorStoreIndex,\n", + " ServiceContext,\n", + " set_global_service_context,\n", + " SimpleDirectoryReader,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c8375e4c-21c3-433c-a7b1-945007a73ac2", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:49:57.097256Z", + "iopub.status.busy": "2024-02-21T05:49:57.096804Z", + "iopub.status.idle": "2024-02-21T05:50:38.941821Z", + "shell.execute_reply": "2024-02-21T05:50:38.941368Z", + "shell.execute_reply.started": "2024-02-21T05:49:57.097233Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: 100%|██████████| 662/662 [00:00<00:00, 6.94MB/s]\n", + "Downloading: 100%|██████████| 51.0/51.0 [00:00<00:00, 586kB/s]\n", + "Downloading: 100%|██████████| 178/178 [00:00<00:00, 2.13MB/s]\n", + "Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 27.9MB/s]\n", + "Downloading: 100%|█████████▉| 3.72G/3.72G [00:08<00:00, 449MB/s]\n", + "Downloading: 100%|█████████▉| 3.64G/3.64G [00:11<00:00, 336MB/s]\n", + "Downloading: 100%|██████████| 38.7k/38.7k [00:00<00:00, 40.0MB/s]\n", + "Downloading: 100%|██████████| 4.13k/4.13k [00:00<00:00, 5.90MB/s]\n", + "Downloading: 100%|██████████| 6.70M/6.70M [00:00<00:00, 121MB/s]\n", + "Downloading: 100%|██████████| 1.13k/1.13k [00:00<00:00, 12.4MB/s]\n", + "Downloading: 100%|██████████| 2.65M/2.65M [00:00<00:00, 91.6MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n", + "We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. 
You can set `max_memory` in to a higher value to use more memory (at your own risk).\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "875c92489c8047c7881342f422f47c79", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00>\\n\" + SYSTEM_PROMPT + \"<>\\n\\n{query_str}[/INST] \"\n", + ")\n", + "\n", + "llm = HuggingFaceLLM(\n", + " context_window=4096,\n", + " max_new_tokens=2048,\n", + " generate_kwargs={\"temperature\": 0.0, \"do_sample\": False},\n", + " query_wrapper_prompt=query_wrapper_prompt,\n", + " tokenizer_name=selected_model,\n", + " model_name=selected_model,\n", + " device_map=\"auto\",\n", + " # change these settings below depending on your GPU\n", + " model_kwargs={\"torch_dtype\": torch.float16},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "38d1acab-e916-459b-9a11-e39a63751d47", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:00.938021Z", + "iopub.status.busy": "2024-02-21T05:51:00.937708Z", + "iopub.status.idle": "2024-02-21T05:51:01.687136Z", + "shell.execute_reply": "2024-02-21T05:51:01.686435Z", + "shell.execute_reply.started": "2024-02-21T05:51:00.937998Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-02-21 13:51:01-- https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "正在解析主机 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)... 8.131.208.119\n", + "正在连接 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)|8.131.208.119|:443... 已连接。\n", + "已发出 HTTP 请求,正在等待回应... 200 OK\n", + "长度: 13228 (13K) [text/markdown]\n", + "正在保存至: ‘data/xianjiaoda/xianjiaoda.md’\n", + "\n", + "data/xianjiaoda/xia 100%[===================>] 12.92K --.-KB/s 用时 0s \n", + "\n", + "2024-02-21 13:51:01 (31.7 MB/s) - 已保存 ‘data/xianjiaoda/xianjiaoda.md’ [13228/13228])\n", + "\n" + ] + } + ], + "source": [ + "!mkdir -p 'data/xianjiaoda/'\n", + "!wget 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md' -O 'data/xianjiaoda/xianjiaoda.md'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ffc74f-a732-4748-8cb8-481cd8a39f81", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader(\"/mnt/workspace/data/xianjiaoda/\").load_data()\n", + "documents" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5689eeaa-8d2c-4df5-9165-abde5d1b3702", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:07.044053Z", + "iopub.status.busy": "2024-02-21T05:51:07.043752Z", + "iopub.status.idle": "2024-02-21T05:51:07.051731Z", + "shell.execute_reply": "2024-02-21T05:51:07.051278Z", + "shell.execute_reply.started": "2024-02-21T05:51:07.044036Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " 
from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " # 使用modelscope的embedding模型(包含下载)\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> List[float]:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embedding(self, text: str) -> List[float]:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding'].tolist()\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> List[float]:\n", + " return self._get_query_embedding(query)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8590cf73-bb5b-498c-993d-d24f15aad77e", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:09.906919Z", + "iopub.status.busy": "2024-02-21T05:51:09.906610Z", + "iopub.status.idle": "2024-02-21T05:51:17.813191Z", + "shell.execute_reply": "2024-02-21T05:51:17.812713Z", + "shell.execute_reply.started": "2024-02-21T05:51:09.906901Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:datasets:PyTorch version 2.1.2+cu121 available.\n", + "PyTorch version 2.1.2+cu121 available.\n", + "INFO:datasets:TensorFlow version 2.14.0 available.\n", + "TensorFlow version 2.14.0 available.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-21 13:51:10,907 - modelscope - WARNING - Model revision not specified, use revision: v1.1.0\n", + "Downloading: 100%|██████████| 917/917 [00:00<00:00, 6.18MB/s]\n", + "Downloading: 100%|██████████| 2.29k/2.29k [00:00<00:00, 23.5MB/s]\n", + "Downloading: 100%|██████████| 60.7k/60.7k [00:00<00:00, 26.3MB/s]\n", + "Downloading: 100%|██████████| 195M/195M [00:00<00:00, 383MB/s] \n", + "Downloading: 100%|██████████| 11.4k/11.4k [00:00<00:00, 40.4MB/s]\n", + "Downloading: 100%|██████████| 125/125 [00:00<00:00, 684kB/s]\n", + "Downloading: 100%|██████████| 429k/429k [00:00<00:00, 20.8MB/s]\n", + "Downloading: 100%|██████████| 366/366 [00:00<00:00, 4.25MB/s]\n", + "2024-02-21 13:51:15,095 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base\n", + "2024-02-21 13:51:15,096 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base.\n", + "2024-02-21 13:51:15,096 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base\n", + "/opt/conda/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "2024-02-21 13:51:15,741 - modelscope - WARNING - No preprocessor field found in cfg.\n", + "2024-02-21 13:51:15,742 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n", + "2024-02-21 13:51:15,742 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base'}. trying to build by task and model information.\n", + "2024-02-21 13:51:15,762 - modelscope - WARNING - No preprocessor field found in cfg.\n", + "2024-02-21 13:51:15,762 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n", + "2024-02-21 13:51:15,763 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base', 'sequence_length': 128}. trying to build by task and model information.\n", + "/tmp/ipykernel_442/427817804.py:2: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n", + " service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py:993: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "set_global_service_context(service_context)\n", + "\n", + "index = VectorStoreIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "df218d21-9ad1-42f3-b44c-47aa56f6edcf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-21T05:51:20.557315Z", + "iopub.status.busy": "2024-02-21T05:51:20.556991Z", + "iopub.status.idle": "2024-02-21T05:51:20.610136Z", + "shell.execute_reply": "2024-02-21T05:51:20.609707Z", + "shell.execute_reply.started": "2024-02-21T05:51:20.557297Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "query_engine = index.as_query_engine()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "10c8c01f-c923-4234-a93e-c37a39358f5b", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:59:18.934204Z", + "iopub.status.busy": "2024-02-21T05:59:18.933908Z", + "iopub.status.idle": "2024-02-21T05:59:19.777534Z", + "shell.execute_reply": "2024-02-21T05:59:19.777054Z", + "shell.execute_reply.started": "2024-02-21T05:59:18.934187Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并,组建新的西安交通大学\n" + ] + } + ], + "source": [ + "response = query_engine.query(\"西安交大是由哪几个学校合并的?\")\n", + "print(response)\n", + "#display(Markdown(f\"{response}\"))" + ] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb new file mode 100644 index 000000000..e6ddabfd5 --- /dev/null +++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Usage\n", + "1. Install python dependencies\n", + "```shell\n", + "!pip install pypdf langchain unstructured transformers_stream_generator\n", + "!pip install modelscope nltk pydantic tiktoken llama-index\n", + "```\n", + "\n", + "2. Download data files we need in this example\n", + "```shell\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/averaged_perceptron_tagger.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /root/nltk_data/tokenizers\n", + "!mkdir -p /root/nltk_data/taggers\n", + "!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers\n", + "!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers\n", + "!cd /root/nltk_data/tokenizers; unzip punkt.zip;\n", + "!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace\n", + "``` \n", + "\n", + "3. 
Enjoy your QA AI" + ], + "metadata": { + "collapsed": false + }, + "id": "8230365523c9330a" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a407764-9392-48ae-9bed-8c73c9f76fbc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-16T08:58:56.323000Z", + "iopub.status.busy": "2024-01-16T08:58:56.322690Z", + "iopub.status.idle": "2024-01-16T08:59:57.862755Z", + "shell.execute_reply": "2024-01-16T08:59:57.862041Z", + "shell.execute_reply.started": "2024-01-16T08:58:56.322980Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install pypdf langchain unstructured transformers_stream_generator\n", + "!pip install modelscope nltk pydantic tiktoken llama-index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "696c6b78-53e8-4135-8376-ce8902b7d79a", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-01-16T09:04:59.193375Z", + "iopub.status.busy": "2024-01-16T09:04:59.193082Z", + "iopub.status.idle": "2024-01-16T09:05:00.971449Z", + "shell.execute_reply": "2024-01-16T09:05:00.970857Z", + "shell.execute_reply.started": "2024-01-16T09:04:59.193357Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/averaged_perceptron_tagger.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /root/nltk_data/tokenizers\n", + "!mkdir -p /root/nltk_data/taggers\n", + "!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers\n", + "!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers\n", + "!cd /root/nltk_data/tokenizers; unzip punkt.zip;\n", + "!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb8feca-c71f-4ad6-8eff-caae95411aa0", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-01-16T09:06:03.024995Z", + "iopub.status.busy": "2024-01-16T09:06:03.024622Z", + "iopub.status.idle": "2024-01-16T09:09:15.894774Z", + "shell.execute_reply": "2024-01-16T09:09:15.894230Z", + "shell.execute_reply.started": "2024-01-16T09:06:03.024974Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from abc import ABC\n", + "from typing import Any, List, Optional, Dict, cast\n", + "\n", + "import torch\n", + "from langchain_core.language_models.llms import LLM\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from modelscope import AutoModelForCausalLM, AutoTokenizer\n", + "from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader\n", + "from llama_index import ServiceContext\n", + "from llama_index.embeddings.base import BaseEmbedding\n", + "from llama_index import set_global_service_context\n", + "from langchain_core.retrievers import BaseRetriever\n", + "from langchain_core.callbacks import CallbackManagerForRetrieverRun\n", + "from langchain_core.documents import Document\n", + "from llama_index.retrievers import VectorIndexRetriever\n", + "\n", + "# configs for LLM\n", + "llm_name = 
\"Qwen/Qwen-1_8B-Chat\"\n", + "llm_revision = \"master\"\n", + "\n", + "# configs for embedding model\n", + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-small\"\n", + "\n", + "# file path for your custom knowledge base\n", + "knowledge_doc_file_dir = \"/mnt/workspace/custom_data/\"\n", + "knowledge_doc_file_path = knowledge_doc_file_dir + \"xianjiaoda.md\"\n", + "\n", + "\n", + "# define our Embedding class to use models in Modelscope\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-small\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> List[float]:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0]\n", + "\n", + " def _get_text_embedding(self, text: str) -> List[float]:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0]\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding']\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> List[float]:\n", + " return self._get_query_embedding(query)\n", + "\n", + "\n", + "# define our Retriever with llama-index to co-operate with Langchain\n", + "# note that the 'LlamaIndexRetriever' defined in langchain-community.retrievers.llama_index.py\n", + "# is no longer compatible with llamaIndex code right now.\n", + "class LlamaIndexRetriever(BaseRetriever):\n", + " index: Any\n", + " \"\"\"LlamaIndex index to query.\"\"\"\n", + "\n", + " def _get_relevant_documents(\n", + " self, query: str, *, run_manager: CallbackManagerForRetrieverRun\n", + " ) -> List[Document]:\n", + " \"\"\"Get documents relevant for a query.\"\"\"\n", + " try:\n", + " from llama_index.indices.base import BaseIndex\n", + " from llama_index.response.schema import Response\n", + " except ImportError:\n", + " raise ImportError(\n", + " \"You need to install `pip install llama-index` to use this retriever.\"\n", + " )\n", + " index = cast(BaseIndex, self.index)\n", + " print('@@@ query=', query)\n", + "\n", + " response = index.as_query_engine().query(query)\n", + " response = cast(Response, response)\n", + " # parse source nodes\n", + " docs = []\n", + " for source_node in response.source_nodes:\n", + " print('@@@@ source=', source_node)\n", + " metadata = source_node.metadata or {}\n", + " docs.append(\n", + " Document(page_content=source_node.get_text(), metadata=metadata)\n", + " )\n", + " return docs\n", + "\n", + "def torch_gc():\n", + " os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + " DEVICE = \"cuda\"\n", + " DEVICE_ID = \"0\"\n", + " CUDA_DEVICE = 
f\"{DEVICE}:{DEVICE_ID}\" if DEVICE_ID else DEVICE\n", + " a = torch.Tensor([1, 2])\n", + " a = a.cuda()\n", + " print(a)\n", + "\n", + " if torch.cuda.is_available():\n", + " with torch.cuda.device(CUDA_DEVICE):\n", + " torch.cuda.empty_cache()\n", + " torch.cuda.ipc_collect()\n", + "\n", + "\n", + "# global resources used by QianWenChatLLM (this is not a good practice)\n", + "tokenizer = AutoTokenizer.from_pretrained(llm_name, revision=llm_revision, trust_remote_code=True)\n", + "model = AutoModelForCausalLM.from_pretrained(llm_name, revision=llm_revision, device_map=\"auto\",\n", + " trust_remote_code=True, fp16=True).eval()\n", + "\n", + "\n", + "# define QianWen LLM based on langchain's LLM to use models in Modelscope\n", + "class QianWenChatLLM(LLM):\n", + " max_length = 10000\n", + " temperature: float = 0.01\n", + " top_p = 0.9\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " @property\n", + " def _llm_type(self):\n", + " return \"ChatLLM\"\n", + "\n", + " def _call(\n", + " self,\n", + " prompt: str,\n", + " stop: Optional[List[str]] = None,\n", + " run_manager=None,\n", + " **kwargs: Any,\n", + " ) -> str:\n", + " print(prompt)\n", + " response, history = model.chat(tokenizer, prompt, history=None)\n", + " torch_gc()\n", + " return response\n", + "\n", + "\n", + "# STEP1: create LLM instance\n", + "qwllm = QianWenChatLLM()\n", + "print('STEP1: qianwen LLM created')\n", + "\n", + "# STEP2: load knowledge file and initialize vector db by llamaIndex\n", + "print('STEP2: reading docs ...')\n", + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=None)\n", + "set_global_service_context(service_context) # global config, not good\n", + "\n", + "llamaIndex_docs = SimpleDirectoryReader(knowledge_doc_file_dir).load_data()\n", + "llamaIndex_index = GPTVectorStoreIndex.from_documents(llamaIndex_docs, chunk_size=512)\n", + "retriever = LlamaIndexRetriever(index=llamaIndex_index)\n", + "print(' 2.2 reading doc done, vec db created.')\n", + "\n", + "# STEP3: create chat template\n", + "prompt_template = \"\"\"请基于```内的内容回答问题。\"\n", + "```\n", + "{context}\n", + "```\n", + "我的问题是:{question}。\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template=prompt_template)\n", + "print('STEP3: chat prompt template created.')\n", + "\n", + "# STEP4: create RAG chain to do QA\n", + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | qwllm\n", + " | StrOutputParser()\n", + ")\n", + "chain.invoke('西安交大的校训是什么?')\n", + "# chain.invoke('魔搭社区有哪些模型?')\n", + "# chain.invoke('modelscope是什么?')\n", + "# chain.invoke('萧峰和乔峰是什么关系?')\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb new file mode 100644 index 000000000..194c46a20 --- /dev/null +++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "source": [ + "# Usage\n", + "\n", + "## 1. Install necessary libs\n", + "```shell\n", + "!pip install modelscope\n", + "!pip install transformers -U\n", + "!pip install llama-index llama-index-llms-huggingface ipywidgets \n", + "```\n", + "\n", + "## 2. Download data files we need in this example\n", + "```shell\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/stopwords.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "\n", + "!cp /mnt/workspace/punkt.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!cp /mnt/workspace/stopwords.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers; unzip punkt.zip;\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora; unzip stopwords.zip;\n", + "\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace\n", + "```\n", + "\n", + "## 3. Go!" + ], + "metadata": { + "collapsed": false + }, + "id": "f4abc589d9bfffca" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "!pip install modelscope\n", + "!pip install transformers -U\n", + "!pip install llama-index llama-index-llms-huggingface ipywidgets " + ], + "metadata": { + "collapsed": false + }, + "id": "c32122833dd7b8c8" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/stopwords.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "\n", + "!cp /mnt/workspace/punkt.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!cp /mnt/workspace/stopwords.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers; unzip punkt.zip;\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora; unzip stopwords.zip;\n", + "\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace" + ], + "metadata": { + "collapsed": false + }, + "id": "63704e2b21a9ba52" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from abc import ABC\n", + "from typing import Any, List\n", + "\n", + "import torch\n", + "from llama_index.core import (\n", + " SimpleDirectoryReader,\n", + " VectorStoreIndex,\n", + " Settings,\n", + " ServiceContext,\n", + " set_global_service_context,\n", + ")\n", + "from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding\n", + "from 
llama_index.core.prompts import PromptTemplate\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "\n", + "from modelscope import snapshot_download\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "# download QWEN model from modelscope\n", + "qwen15_4B_CHAT = \"qwen/Qwen1.5-4B-Chat\"\n", + "selected_model = snapshot_download(qwen15_4B_CHAT)\n", + "\n", + "# define sys prompt\n", + "SYSTEM_PROMPT = \"\"\"You are a helpful AI assistant.\"\"\"\n", + "query_wrapper_prompt = PromptTemplate(\n", + " \"[INST]<>\\n\" + SYSTEM_PROMPT + \"<>\\n\\n{query_str}[/INST] \"\n", + ")\n", + "\n", + "# create HuggingFaceLLM with qwen1.5 \n", + "llm = HuggingFaceLLM(\n", + " context_window=4096,\n", + " max_new_tokens=2048,\n", + " generate_kwargs={\"temperature\": 0.0, \"do_sample\": False},\n", + " query_wrapper_prompt=query_wrapper_prompt,\n", + " tokenizer_name=selected_model,\n", + " model_name=selected_model,\n", + " device_map=\"auto\",\n", + " # change these settings below depending on your GPU\n", + " model_kwargs={\"torch_dtype\": torch.float16},\n", + ")\n", + "print(\"llm created\")\n", + "\n", + "\n", + "# wrap modelscope embedding for llama-index (based on BaseEmbedding)\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " # 使用modelscope的embedding模型(包含下载)\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> Embedding:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " # note that we have to call tolist() to change numpy.ndarray into python list\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embedding(self, text: str) -> Embedding:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding'].tolist()\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> Embedding:\n", + " return self._get_query_embedding(query)\n", + "\n", + "\n", + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "set_global_service_context(service_context)\n", + "Settings.embed_model = embeddings\n", + "\n", + "# load example documents\n", + "documents = SimpleDirectoryReader(\"/mnt/workspace/custom_data/\").load_data()\n", + "\n", + "# create Vector DB\n", 
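+    "# from_documents() chunks the loaded docs, embeds each chunk with the\n",
+    "# ModelScope embedding model configured above, and keeps the vectors in an in-memory store\n",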
+ "index = VectorStoreIndex.from_documents(documents)\n", + "\n", + "# set Logging to DEBUG for more detailed outputs\n", + "query_engine = index.as_query_engine()\n", + "\n", + "# do query\n", + "response = query_engine.query(\"西安较大的校训是什么\")\n", + "print(response)\n" + ], + "metadata": { + "collapsed": false + }, + "id": "eef67659e94045c5" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" "b/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" new file mode 100644 index 000000000..234859a7f --- /dev/null +++ "b/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "89373920-4a59-473e-8b7d-7f30570637c7", + "metadata": {}, + "source": [ + "Stable diffusion模型推理方法1:SDXL模型,魔搭社区Pipeline已经集成SDXL模型,可以直接使用" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641a04c4-ee0b-4cef-93e2-bca0269e7486", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from modelscope.utils.constant import Tasks\n", + "from modelscope.pipelines import pipeline\n", + "import cv2\n", + "\n", + "pipe = pipeline(task=Tasks.text_to_image_synthesis, \n", + " model='AI-ModelScope/stable-diffusion-xl-base-1.0',\n", + " use_safetensors=True,\n", + " model_revision='v1.0.0')\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "output = pipe({'text': prompt})\n", + "cv2.imwrite('SDXL.png', output['output_imgs'][0])" + ] + }, + { + "cell_type": "markdown", + "id": "c5740ed4-2c6a-4b0b-8bb7-6ef466d2a08f", + "metadata": {}, + "source": [ + "秒级推理方法1:SDXL-turbo模型是SDXL 1.0的蒸馏版本,SDXL-Turbo基于一种称之为对抗扩散蒸馏(ADD)的新颖的训练方法,这种方法在扩散模型采样可以减少到1到4步,而生成高质量图像。ADD的训练方式使用得分蒸馏,利用大规模扩散模型作为教师模型,并将其与对抗性损失相结合,即使在1-2步的采样步骤的低步骤状态下,使用对抗学习的方式,引入discriminator来辅助生成质量的把控,也可以确保高质量图像的保真度。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bef68ad6-1fc9-4fff-850e-9bd4cc3ef756", + "metadata": {}, + "outputs": [], + "source": [ + "from diffusers import AutoPipelineForText2Image\n", + "import torch\n", + "from modelscope import snapshot_download\n", + "\n", + "model_dir = snapshot_download(\"AI-ModelScope/sdxl-turbo\")\n", + "\n", + "pipe = AutoPipelineForText2Image.from_pretrained(model_dir, torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipe.to(\"cuda\")\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "\n", + "image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]\n", + "image.save(\"SDXLturbo.png\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"bf25d186-317e-4e53-bed5-c801b336b3ff", + "metadata": {}, + "source": [ + "秒级推理方法2:SDXL+LCM,潜在一致性模型(LCM)受一致性模型(CM)启发,在预训练的LDM上以较少的步骤进行快速推理。LCM-SD系列是在Stable Diffusion的基础上新增Consistency 约束蒸馏的结果,仅通过2-8步的推理即可实现高质量的文本到图片的生成性能。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58e1b7b6-f2d1-4a04-9a31-108f567b5c64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler\n", + "import torch\n", + "from modelscope import snapshot_download\n", + "\n", + "model_dir_lcm = snapshot_download(\"AI-ModelScope/lcm-sdxl\",revision = \"master\")\n", + "model_dir_sdxl = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\",revision = \"v1.0.9\")\n", + "\n", + "unet = UNet2DConditionModel.from_pretrained(model_dir_lcm, torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipe = DiffusionPipeline.from_pretrained(model_dir_sdxl, unet=unet, torch_dtype=torch.float16, variant=\"fp16\")\n", + "\n", + "pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)\n", + "pipe.to(\"cuda\")\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "image = pipe(prompt, num_inference_steps=4, guidance_scale=8.0).images[0]\n", + "image.save(\"SDXLLCM.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "ec6a4dda-2d8c-4fb5-bcbd-468462d9e3c6", + "metadata": {}, + "source": [ + "秒级推理方法3:stable-cascade模型基于Würstchen架构构建,与稳定扩散等其他模型的主要区别在于它在更小的潜在空间中工作。潜在空间越小,推理速度就越快,训练成本也就越低。潜在空间有多小?稳定扩散使用压缩因子 8,从而将 1024x1024 图像编码为 128x128。Stable Cascade 的压缩系数为 42,这意味着可以将 1024x1024 图像编码为 24x24,同时保持清晰的重建。然后在高度压缩的潜在空间中训练文本条件模型。与稳定扩散 1.5 相比,该架构的先前版本实现了 16 倍的成本降低。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4155f18d-0504-42e6-b785-02ed4a519c1f", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from modelscope import snapshot_download\n", + "from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline\n", + "\n", + "device = \"cuda\"\n", + "num_images_per_prompt = 1\n", + "\n", + "stable_cascade_prior = snapshot_download(\"AI-ModelScope/stable-cascade-prior\")\n", + "stable_cascade = snapshot_download(\"AI-ModelScope/stable-cascade\")\n", + "\n", + "prior = StableCascadePriorPipeline.from_pretrained(stable_cascade_prior, torch_dtype=torch.bfloat16).to(device)\n", + "decoder = StableCascadeDecoderPipeline.from_pretrained(stable_cascade, torch_dtype=torch.float16).to(device)\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "negative_prompt = \"\"\n", + "\n", + "prior_output = prior(\n", + " prompt=prompt,\n", + " height=1024,\n", + " width=1024,\n", + " negative_prompt=negative_prompt,\n", + " guidance_scale=4.0,\n", + " num_images_per_prompt=num_images_per_prompt,\n", + " num_inference_steps=20\n", + ")\n", + "decoder_output = decoder(\n", + " image_embeddings=prior_output.image_embeddings.half(),\n", + " prompt=prompt,\n", + " negative_prompt=negative_prompt,\n", + " guidance_scale=0.0,\n", + " output_type=\"pil\",\n", + " num_inference_steps=10\n", + ").images\n", + "\n", + "for i, img in enumerate(decoder_output):\n", + " 
img.save(f\"stablecascade_{i+1}.png\")\n", + "#Now decoder_output is a list with your PIL images" + ] + }, + { + "cell_type": "markdown", + "id": "c402e461-2245-4e38-839b-6a5992c03b00", + "metadata": {}, + "source": [ + "秒级推理方法4:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f42531c8-c428-4ae7-aef1-b56050bffc71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler\n", + "from modelscope.hub.file_download import model_file_download\n", + "from modelscope import snapshot_download\n", + "from safetensors.torch import load_file\n", + "\n", + "base = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\")\n", + "repo = \"AI-ModelScope/SDXL-Lightning\"\n", + "ckpt = \"sdxl_lightning_4step_unet.safetensors\" # Use the correct ckpt for your step setting!\n", + "\n", + "# Load model.\n", + "unet = UNet2DConditionModel.from_config(base, subfolder=\"unet\").to(\"cuda\", torch.float16)\n", + "unet.load_state_dict(load_file(model_file_download(repo, ckpt), device=\"cuda\"))\n", + "pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant=\"fp16\").to(\"cuda\")\n", + "\n", + "# Ensure sampler uses \"trailing\" timesteps.\n", + "pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing=\"trailing\")\n", + "\n", + "# Ensure using the same inference steps as the loaded model and CFG set to 0.\n", + "pipe(\"A girl smiling\", num_inference_steps=4, guidance_scale=0).images[0].save(\"sdxllightning.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "adbedb78-90fb-4509-a3a6-6262d0d51bcf", + "metadata": {}, + "source": [ + "微调lora叠加推理" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c418dc94-6c35-4ac2-8807-e796d5488525", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from diffusers import AutoPipelineForText2Image\n", + "from modelscope import snapshot_download\n", + "import torch\n", + "\n", + "model_dir=snapshot_download(\"YorickHe/majicmixRealistic_v6\")\n", + "lora_dir = snapshot_download(\"PaperCloud/zju19_dunhuang_style_lora\")\n", + "\n", + "pipeline = AutoPipelineForText2Image.from_pretrained(f\"{model_dir}/v7\", torch_dtype=torch.float16).to(\"cuda\")\n", + "pipeline.load_lora_weights(lora_dir, weight_name=\"dunhuang.safetensors\")\n", + "prompt = \"1 girl, close-up, waist shot, black long hair, clean face, dunhuang, Chinese ancient style, clean skin, organza_lace, Dunhuang wind, Art deco, Necklace, jewelry, Bracelet, Earrings, dunhuang_style, see-through_dress, Expressionism, looking towards the camera, upper_body, raw photo, masterpiece, solo, medium shot, high detail face, photorealistic, best quality\"\n", + "#Negative Prompt = \"\"\"(nsfw:2), paintings, sketches, (worst quality:2), (low quality:2), lowers, normal quality, ((monochrome)), ((grayscale)), logo, word, character, bad hand, tattoo, (username, watermark, signature, time signature, timestamp, artist name, copyright name, copyright),low res, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, extra fingers, fewer fingers, strange fingers, bad hand, mole, ((extra legs)), ((extra hands))\"\"\"\n", + "image = pipeline(prompt).images[0]\n", + "image.save(\"sdlora.png\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6c36c14f-9481-48f1-a6ef-617d7551b63d", + "metadata": {}, + 
"source": [ + "SD+controlnet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1f1c616d-0d45-4a8d-8140-0b6b352920b9", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-28T00:22:32.730370Z", + "iopub.status.busy": "2024-02-28T00:22:32.729999Z", + "iopub.status.idle": "2024-02-28T00:23:48.650291Z", + "shell.execute_reply": "2024-02-28T00:23:48.649123Z", + "shell.execute_reply.started": "2024-02-28T00:22:32.730354Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2024-02-28 08:22:35.104069: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-02-28 08:22:35.132215: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-02-28 08:22:35.174367: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-02-28 08:22:35.174385: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-02-28 08:22:35.174411: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-02-28 08:22:35.182970: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-02-28 08:22:35.183413: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-02-28 08:22:36.189620: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "2024-02-28 08:22:39,294 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.\n", + "2024-02-28 08:22:39,296 - modelscope - INFO - TensorFlow version 2.14.0 Found.\n", + "2024-02-28 08:22:39,296 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n", + "2024-02-28 08:22:39,341 - modelscope - INFO - Loading done! 
Current index file version is 1.12.0, with md5 509123dba36c5e70a95f6780df348471 and a total number of 964 components indexed\n", + "2024-02-28 08:22:39,713 - modelscope - WARNING - Model revision not specified, use revision: v1.0.9\n", + "Loading pipeline components...: 100%|██████████| 7/7 [00:36<00:00, 5.19s/it]\n", + "100%|██████████| 50/50 [00:15<00:00, 3.24it/s]\n" + ] + } + ], + "source": [ + "from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL\n", + "from diffusers.utils import load_image, make_image_grid\n", + "from PIL import Image\n", + "from modelscope import snapshot_download\n", + "import cv2\n", + "import numpy as np\n", + "import torch\n", + "\n", + "\n", + "model_dir = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\")\n", + "controlnet_dir = snapshot_download(\"AI-ModelScope/controlnet-canny-sdxl-1.0\")\n", + "VAE_dir = snapshot_download(\"AI-ModelScope/sdxl-vae-fp16-fix\")\n", + "original_image = load_image(\n", + " \"/mnt/workspace/canny.jpg\"\n", + ")\n", + "\n", + "prompt = \"sea turtle, hard lighting\"\n", + "negative_prompt = 'low quality, bad quality, sketches'\n", + "\n", + "image = load_image(\"/mnt/workspace/canny.jpg\")\n", + "\n", + "controlnet_conditioning_scale = 0.5 # recommended for good generalization\n", + "\n", + "controlnet = ControlNetModel.from_pretrained(\n", + " controlnet_dir,\n", + " torch_dtype=torch.float16\n", + ")\n", + "vae = AutoencoderKL.from_pretrained(VAE_dir, torch_dtype=torch.float16)\n", + "pipe = StableDiffusionXLControlNetPipeline.from_pretrained(\n", + " model_dir,\n", + " controlnet=controlnet,\n", + " vae=vae,\n", + " torch_dtype=torch.float16,\n", + ")\n", + "pipe.enable_model_cpu_offload()\n", + "\n", + "image = np.array(image)\n", + "image = cv2.Canny(image, 100, 200)\n", + "image = image[:, :, None]\n", + "image = np.concatenate([image, image, image], axis=2)\n", + "image = Image.fromarray(image)\n", + "\n", + "images = pipe(\n", + " prompt, negative_prompt=negative_prompt, image=image, controlnet_conditioning_scale=controlnet_conditioning_scale,\n", + " ).images\n", + "\n", + "images[0].save(f\"controlnet.png\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modelscope/__init__.py b/modelscope/__init__.py index 97abdbd3d..c969be684 100644 --- a/modelscope/__init__.py +++ b/modelscope/__init__.py @@ -1,15 +1,17 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING -from modelscope.utils.import_utils import LazyImportModule -from .utils.automodel_utils import fix_transformers_upgrade +from modelscope.utils.import_utils import (LazyImportModule, + is_transformers_available) if TYPE_CHECKING: from .exporters import Exporter, TfModelExporter, TorchModelExporter from .hub.api import HubApi from .hub.check_model import check_local_model_is_latest, check_model_is_id from .hub.push_to_hub import push_to_hub, push_to_hub_async - from .hub.snapshot_download import snapshot_download + from .hub.snapshot_download import snapshot_download, dataset_snapshot_download + from .hub.file_download import model_file_download, dataset_file_download + from .metrics import ( AccuracyMetric, AudioNoiseMetric, BleuMetric, ImageColorEnhanceMetric, ImageColorizationMetric, ImageDenoiseMetric, ImageInpaintingMetric, @@ -29,13 +31,34 @@ from .trainers import (EpochBasedTrainer, Hook, Priority, TrainingArgs, build_dataset_from_file) from .utils.constant import Tasks - from .utils.hf_util import AutoConfig, GPTQConfig, BitsAndBytesConfig - from .utils.hf_util import (AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification, - AutoModelForTokenClassification, AutoTokenizer, - GenerationConfig, AutoImageProcessor, - BatchFeature) + if is_transformers_available(): + from .utils.hf_util import ( + AutoModel, AutoProcessor, AutoFeatureExtractor, GenerationConfig, + AutoConfig, GPTQConfig, AwqConfig, BitsAndBytesConfig, + AutoModelForCausalLM, AutoModelForSeq2SeqLM, + AutoModelForVision2Seq, AutoModelForSequenceClassification, + AutoModelForTokenClassification, AutoModelForImageClassification, + AutoModelForImageTextToText, + AutoModelForZeroShotImageClassification, + AutoModelForKeypointDetection, + AutoModelForDocumentQuestionAnswering, + AutoModelForSemanticSegmentation, + AutoModelForUniversalSegmentation, + AutoModelForInstanceSegmentation, AutoModelForObjectDetection, + AutoModelForZeroShotObjectDetection, + AutoModelForAudioClassification, AutoModelForSpeechSeq2Seq, + AutoModelForMaskedImageModeling, + AutoModelForVisualQuestionAnswering, + AutoModelForTableQuestionAnswering, AutoModelForImageToImage, + AutoModelForImageSegmentation, AutoModelForQuestionAnswering, + AutoModelForMaskedLM, AutoTokenizer, AutoModelForMaskGeneration, + AutoModelForPreTraining, AutoModelForTextEncoding, + AutoImageProcessor, BatchFeature, Qwen2VLForConditionalGeneration, + T5EncoderModel) + else: + print( + 'transformer is not installed, please install it if you want to use related modules' + ) from .utils.hub import create_model_if_not_exist, read_config from .utils.logger import get_logger from .version import __release_datetime__, __version__ @@ -53,7 +76,9 @@ 'TorchModelExporter', ], 'hub.api': ['HubApi'], - 'hub.snapshot_download': ['snapshot_download'], + 'hub.snapshot_download': + ['snapshot_download', 'dataset_snapshot_download'], + 'hub.file_download': ['model_file_download', 'dataset_file_download'], 'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'], 'hub.check_model': ['check_model_is_id', 'check_local_model_is_latest'], @@ -78,17 +103,37 @@ 'utils.hub': ['read_config', 'create_model_if_not_exist'], 'utils.logger': ['get_logger'], 'utils.constant': ['Tasks'], - 'utils.hf_util': [ - 'AutoConfig', 'GenerationConfig', 'AutoModel', 'GPTQConfig', - 'BitsAndBytesConfig', 'AutoModelForCausalLM', - 'AutoModelForSeq2SeqLM', 'AutoTokenizer', - 'AutoModelForSequenceClassification', - 'AutoModelForTokenClassification', 
'AutoImageProcessor', - 'BatchFeature' - ], 'msdatasets': ['MsDataset'] } + if is_transformers_available(): + _import_structure['utils.hf_util'] = [ + 'AutoModel', 'AutoProcessor', 'AutoFeatureExtractor', + 'GenerationConfig', 'AutoConfig', 'GPTQConfig', 'AwqConfig', + 'BitsAndBytesConfig', 'AutoModelForCausalLM', + 'AutoModelForSeq2SeqLM', 'AutoModelForVision2Seq', + 'AutoModelForSequenceClassification', + 'AutoModelForTokenClassification', + 'AutoModelForImageClassification', 'AutoModelForImageToImage', + 'AutoModelForImageTextToText', + 'AutoModelForZeroShotImageClassification', + 'AutoModelForKeypointDetection', + 'AutoModelForDocumentQuestionAnswering', + 'AutoModelForSemanticSegmentation', + 'AutoModelForUniversalSegmentation', + 'AutoModelForInstanceSegmentation', 'AutoModelForObjectDetection', + 'AutoModelForZeroShotObjectDetection', + 'AutoModelForAudioClassification', 'AutoModelForSpeechSeq2Seq', + 'AutoModelForMaskedImageModeling', + 'AutoModelForVisualQuestionAnswering', + 'AutoModelForTableQuestionAnswering', + 'AutoModelForImageSegmentation', 'AutoModelForQuestionAnswering', + 'AutoModelForMaskedLM', 'AutoTokenizer', + 'AutoModelForMaskGeneration', 'AutoModelForPreTraining', + 'AutoModelForTextEncoding', 'AutoImageProcessor', 'BatchFeature', + 'Qwen2VLForConditionalGeneration', 'T5EncoderModel' + ] + import sys sys.modules[__name__] = LazyImportModule( @@ -98,5 +143,3 @@ module_spec=__spec__, extra_objects={}, ) - -fix_transformers_upgrade() diff --git a/modelscope/cli/clearcache.py b/modelscope/cli/clearcache.py new file mode 100644 index 000000000..dcd3d1dfe --- /dev/null +++ b/modelscope/cli/clearcache.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from argparse import ArgumentParser +from pathlib import Path + +from modelscope.cli.base import CLICommand +from modelscope.hub.constants import TEMPORARY_FOLDER_NAME +from modelscope.hub.utils.utils import get_model_masked_directory + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return ClearCacheCMD(args) + + +class ClearCacheCMD(CLICommand): + name = 'clear-cache' + + def __init__(self, args): + self.args = args + self.cache_dir = os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(ClearCacheCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help= + 'The id of the model whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + group.add_argument( + '--dataset', + type=str, + help= + 'The id of the dataset whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + self._execute_with_confirmation() + + def _execute_with_confirmation(self): + all = False + single_model = False + prompt = '\nYou are about to delete ' + + if self.args.model or self.args.dataset: + if self.args.model: + id = self.args.model + single_model = True + prompt = prompt + f'local cache for model {id}. ' + else: + id = self.args.dataset + prompt = prompt + f'local cache for dataset {id}. 
' + else: + prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n' + all = True + user_input = input( + prompt + + '\nPlease press Y or y to proceed, any other key to abort.\n' + ).strip().upper() + + if user_input == 'Y': + if all: + self._remove_directory(self.cache_dir) + print('Cache cleared.') + else: + entity_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', id) + temp_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', + TEMPORARY_FOLDER_NAME, id) + entity_removed = self._remove_directory(entity_directory) + temp_removed = self._remove_directory(temp_directory) + if (not entity_removed) and (not temp_removed): + if single_model: + print( + f'Cache for Model {id} not found. Nothing to do.') + else: + print( + f'Cache for Dataset {id} not found. Nothing to do.' + ) + else: + print('Cache cleared.') + else: + print('Operation aborted.') + return + + def _remove_directory(self, path): + if os.path.exists(path): + try: + if os.path.islink(path): + shutil.rmtree(os.readlink(path)) + os.remove(path) + print(f'Cache and link for {path} removed.') + else: + shutil.rmtree(path) + print(f'Cache folder {path} removed.') + return True + except Exception as e: + print(f'An error occurred while clearing cache at {path}: {e}') + return False diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py index a25502fde..24fcc134f 100644 --- a/modelscope/cli/cli.py +++ b/modelscope/cli/cli.py @@ -1,29 +1,46 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import argparse +import logging +from modelscope.cli.clearcache import ClearCacheCMD from modelscope.cli.download import DownloadCMD +from modelscope.cli.llamafile import LlamafileCMD +from modelscope.cli.login import LoginCMD from modelscope.cli.modelcard import ModelCardCMD from modelscope.cli.pipeline import PipelineCMD from modelscope.cli.plugins import PluginsCMD +from modelscope.cli.server import ServerCMD +from modelscope.hub.api import HubApi +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) def run_cmd(): parser = argparse.ArgumentParser( 'ModelScope Command Line tool', usage='modelscope []') + parser.add_argument( + '--token', default=None, help='Specify ModelScope SDK token.') subparsers = parser.add_subparsers(help='modelscope commands helpers') DownloadCMD.define_args(subparsers) + ClearCacheCMD.define_args(subparsers) PluginsCMD.define_args(subparsers) PipelineCMD.define_args(subparsers) ModelCardCMD.define_args(subparsers) + ServerCMD.define_args(subparsers) + LoginCMD.define_args(subparsers) + LlamafileCMD.define_args(subparsers) args = parser.parse_args() if not hasattr(args, 'func'): parser.print_help() exit(1) - + if args.token is not None: + api = HubApi() + api.login(args.token) cmd = args.func(args) cmd.execute() diff --git a/modelscope/cli/download.py b/modelscope/cli/download.py index e6d316a29..aa23a3019 100644 --- a/modelscope/cli/download.py +++ b/modelscope/cli/download.py @@ -3,7 +3,10 @@ from argparse import ArgumentParser from modelscope.cli.base import CLICommand -from modelscope.hub.snapshot_download import snapshot_download +from modelscope.hub.file_download import (dataset_file_download, + model_file_download) +from modelscope.hub.snapshot_download import (dataset_snapshot_download, + snapshot_download) def subparser_func(args): @@ -22,9 +25,32 @@ def __init__(self, args): def define_args(parsers: ArgumentParser): """ define args for download command. 
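As a quick reference for the clear-cache subcommand wired up above: it only ever touches two locations per model or dataset, the snapshot folder and its '._____temp' staging twin under the cache root. A minimal sketch of that path resolution, assuming a placeholder model id (the cache-root lookup mirrors ClearCacheCMD.__init__):

    import os
    from pathlib import Path

    # Cache root, resolved the same way ClearCacheCMD does it.
    cache_dir = os.getenv('MODELSCOPE_CACHE',
                          str(Path.home().joinpath('.cache', 'modelscope')))

    model_id = 'namespace/model-name'  # placeholder id, for illustration only
    snapshot_dir = os.path.join(cache_dir, 'hub', model_id)            # downloaded files
    temp_dir = os.path.join(cache_dir, 'hub', '._____temp', model_id)  # partial downloads
    print(snapshot_dir)
    print(temp_dir)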
""" - parser = parsers.add_parser(DownloadCMD.name) + parser: ArgumentParser = parsers.add_parser(DownloadCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help='The id of the model to be downloaded. For download, ' + 'the id of either a model or dataset must be provided.') + group.add_argument( + '--dataset', + type=str, + help='The id of the dataset to be downloaded. For download, ' + 'the id of either a model or dataset must be provided.') + parser.add_argument( + 'repo_id', + type=str, + nargs='?', + default=None, + help='Optional, ' + 'ID of the repo to download, It can also be set by --model or --dataset.' + ) parser.add_argument( - 'model', type=str, help='Name of the model to be downloaded.') + '--repo-type', + choices=['model', 'dataset'], + default='model', + help="Type of repo to download from (defaults to 'model').", + ) parser.add_argument( '--revision', type=str, @@ -35,10 +61,105 @@ def define_args(parsers: ArgumentParser): type=str, default=None, help='Cache directory to save model.') + parser.add_argument( + '--local_dir', + type=str, + default=None, + help='File will be downloaded to local location specified by' + 'local_dir, in this case, cache_dir parameter will be ignored.') + parser.add_argument( + 'files', + type=str, + default=None, + nargs='*', + help='Specify relative path to the repository file(s) to download.' + "(e.g 'tokenizer.json', 'onnx/decoder_model.onnx').") + parser.add_argument( + '--include', + nargs='*', + default=None, + type=str, + help='Glob patterns to match files to download.' + 'Ignored if file is specified') + parser.add_argument( + '--exclude', + nargs='*', + type=str, + default=None, + help='Glob patterns to exclude from files to download.' + 'Ignored if file is specified') parser.set_defaults(func=subparser_func) def execute(self): - snapshot_download( - self.args.model, - cache_dir=self.args.cache_dir, - revision=self.args.revision) + if self.args.model or self.args.dataset: + # the position argument of files will be put to repo_id. + if self.args.repo_id is not None: + if self.args.files: + self.args.files.insert(0, self.args.repo_id) + else: + self.args.files = [self.args.repo_id] + else: + if self.args.repo_id is not None: + if self.args.repo_type == 'model': + self.args.model = self.args.repo_id + elif self.args.repo_type == 'dataset': + self.args.dataset = self.args.repo_id + else: + raise Exception('Not support repo-type: %s' + % self.args.repo_type) + if not self.args.model and not self.args.dataset: + raise Exception('Model or dataset must be set.') + if self.args.model: + if len(self.args.files) == 1: # download single file + model_file_download( + self.args.model, + self.args.files[0], + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + revision=self.args.revision) + elif len( + self.args.files) > 1: # download specified multiple files. 
+ snapshot_download( + self.args.model, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.files, + ) + else: # download repo + snapshot_download( + self.args.model, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.include, + ignore_file_pattern=self.args.exclude, + ) + elif self.args.dataset: + if len(self.args.files) == 1: # download single file + dataset_file_download( + self.args.dataset, + self.args.files[0], + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + revision=self.args.revision) + elif len( + self.args.files) > 1: # download specified multiple files. + dataset_snapshot_download( + self.args.dataset, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.files, + ) + else: # download repo + dataset_snapshot_download( + self.args.dataset, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.include, + ignore_file_pattern=self.args.exclude, + ) + else: + pass # noop diff --git a/modelscope/cli/llamafile.py b/modelscope/cli/llamafile.py new file mode 100644 index 000000000..23f3fe914 --- /dev/null +++ b/modelscope/cli/llamafile.py @@ -0,0 +1,158 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import logging +import os +import sys +from argparse import ArgumentParser + +from modelscope import model_file_download +from modelscope.cli.base import CLICommand +from modelscope.hub.api import HubApi +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return LlamafileCMD(args) + + +class LlamafileCMD(CLICommand): + name = 'llamafile' + + def __init__(self, args): + self.args = args + self.model_id = self.args.model + if self.model_id is None or self.model_id.count('/') != 1: + raise ValueError(f'Invalid model id [{self.model_id}].') + if self.args.file is not None: + # ignore accuracy if file argument is provided + self.args.accuracy = None + if not self.args.file.lower().endswith('.llamafile'): + raise ValueError('file argument must ends with ".llamafile".') + self.api = HubApi() + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(LlamafileCMD.name) + parser.add_argument( + '--model', + type=str, + required=True, + help= + 'The id of the model, whose repo must contain at least one llamafile' + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--accuracy', + type=str, + required=False, + default='q4_k_m', + help= + 'Selected accuracy of GGUF files in the repo. Ignored when "file" is also provided.' + ) + + group.add_argument( + '--file', + type=str, + required=False, + help= + 'The name of a specified llamafile in the model repo. This takes precedence over "accuracy".' + ) + + parser.add_argument( + '--local_dir', + type=str, + default=None, + help= + 'Directory where the selected llamafile would will be downloaded to.' + ) + + group.add_argument( + '--launch', + type=str, + required=False, + default='True', + help= + 'Whether to launch model with the downloaded llamafile, default to True.' 
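The behaviour implemented by DownloadCMD.execute above is also reachable directly from Python; a small sketch, with the repo id, file name and patterns as placeholders (a single file goes through model_file_download, a filtered repo through snapshot_download):

    from modelscope.hub.file_download import model_file_download
    from modelscope.hub.snapshot_download import snapshot_download

    repo_id = 'namespace/model-name'  # placeholder

    # One file (CLI: a single positional file argument).
    config_path = model_file_download(repo_id, 'config.json')

    # Whole repo filtered by glob patterns (CLI: --include / --exclude).
    repo_dir = snapshot_download(
        repo_id,
        allow_file_pattern=['*.json', '*.txt'],  # --include
        ignore_file_pattern=['*.bin'],           # --exclude
    )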
+ ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + if self.args.file: + self.args.accuracy = None + + all_files = self.api.get_model_files(self.model_id, recursive=True) + llamafiles = [] + for info in all_files: + file_path = info['Path'] + if file_path and file_path.lower().endswith( + '.llamafile') and '-of-' not in file_path.lower(): + llamafiles.append(file_path) + if not llamafiles: + raise ValueError( + f'Cannot locate a valid llamafile in repo {self.model_id}.') + logger.info( + f'list of llamafiles in repo {self.model_id}:\n{llamafiles}.') + # default choose the first llamafile if there is no q4_k_m, and no accuracy or file is specified + selected_file = llamafiles[0] + found = False + for f in llamafiles: + if self.args.file and f == self.args.file: + selected_file = f + found = True + break + if self.args.accuracy and self.args.accuracy.lower() in f.lower(): + selected_file = f + found = True + break + if found: + print(f'llamafile matching criteria found: [{selected_file}].') + else: + print( + f'No matched llamafile found in repo, choosing the first llamafile in repo: [{selected_file}]' + ) + downloaded_file = os.path.abspath( + model_file_download( + self.args.model, selected_file, local_dir=self.args.local_dir)) + + if sys.platform.startswith('win'): + downloaded_file = self._rename_extension(downloaded_file) + + if self.args.launch.lower() == 'true': + print(f'Launching model with llamafile [{downloaded_file}]:') + self._execute_llamafile(downloaded_file) + else: + print( + f'No Launching. Llamafile model downloaded to [{downloaded_file}], you may execute it separately.' + ) + + def _execute_llamafile(self, file_path): + current_mode = os.stat(file_path).st_mode + new_mode = current_mode | 0o111 + os.chmod(file_path, new_mode) + execute_cmd = file_path + has_gpu = False + try: + import torch + has_gpu = torch.cuda.is_available() + except ModuleNotFoundError: + # we depend on torch to detect gpu. + # if torch is not available, we will just assume gpu cannot be used + pass + if has_gpu: + print( + 'GPU detected, launching model with llamafile GPU option >>>') + execute_cmd = f'{execute_cmd} -ngl 999' + os.system(execute_cmd) + + def _rename_extension(self, original_file_name): + directory, filename = os.path.split(original_file_name) + base_name, _ = os.path.splitext(filename) + new_filename = os.path.join(directory, f'{base_name}.exe') + os.rename(original_file_name, new_filename) + return new_filename diff --git a/modelscope/cli/login.py b/modelscope/cli/login.py new file mode 100644 index 000000000..613b3205a --- /dev/null +++ b/modelscope/cli/login.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from argparse import ArgumentParser + +from modelscope.cli.base import CLICommand +from modelscope.hub.api import HubApi + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return LoginCMD(args) + + +class LoginCMD(CLICommand): + name = 'login' + + def __init__(self, args): + self.args = args + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for login command. 
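The llamafile selection rule in LlamafileCMD.execute above boils down to: take an exact --file match if given, otherwise the first file whose name contains the accuracy tag, otherwise the first llamafile in the repo. A stand-alone sketch of that rule, with a hypothetical file listing:

    llamafiles = ['qwen.Q8_0.llamafile', 'qwen.Q4_K_M.llamafile']  # hypothetical listing
    requested_file = None   # --file
    accuracy = 'q4_k_m'     # --accuracy (the command's default)

    selected = llamafiles[0]
    for name in llamafiles:
        if requested_file and name == requested_file:
            selected = name
            break
        if accuracy and accuracy.lower() in name.lower():
            selected = name
            break
    print(selected)  # -> qwen.Q4_K_M.llamafile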
+ """ + parser = parsers.add_parser(LoginCMD.name) + parser.add_argument( + '--token', + type=str, + required=True, + help='The Access Token for modelscope.') + parser.set_defaults(func=subparser_func) + + def execute(self): + api = HubApi() + api.login(self.args.token) diff --git a/modelscope/cli/modelcard.py b/modelscope/cli/modelcard.py index 5e2b65803..646cf1b0f 100644 --- a/modelscope/cli/modelcard.py +++ b/modelscope/cli/modelcard.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging import os import shutil import tempfile @@ -11,7 +12,7 @@ from modelscope.hub.utils.utils import get_endpoint from modelscope.utils.logger import get_logger -logger = get_logger() +logger = get_logger(log_level=logging.WARNING) current_path = os.path.dirname(os.path.abspath(__file__)) template_path = os.path.join(current_path, 'template') @@ -29,7 +30,8 @@ class ModelCardCMD(CLICommand): def __init__(self, args): self.args = args self.api = HubApi() - self.api.login(args.access_token) + if args.access_token: + self.api.login(args.access_token) self.model_id = os.path.join( self.args.group_id, self.args.model_id ) if '/' not in self.args.model_id else self.args.model_id @@ -39,12 +41,12 @@ def __init__(self, args): def define_args(parsers: ArgumentParser): """ define args for create or upload modelcard command. """ - parser = parsers.add_parser(ModelCardCMD.name) + parser = parsers.add_parser(ModelCardCMD.name, aliases=['model']) parser.add_argument( '-tk', '--access_token', type=str, - required=True, + required=False, help='the certification of visit ModelScope') parser.add_argument( '-act', @@ -70,13 +72,15 @@ def define_args(parsers: ArgumentParser): '--visibility', type=int, default=5, - help='the visibility of ModelScope') + help= + 'the visibility of ModelScope[PRIVATE: 1, INTERNAL:3, PUBLIC:5]') parser.add_argument( '-lic', '--license', type=str, default='Apache License 2.0', - help='the license of visit ModelScope') + help='the license of visit ModelScope[Apache License 2.0|' + 'GPL-2.0|GPL-3.0|LGPL-2.1|LGPL-3.0|AFL-3.0|ECL-2.0|MIT]') parser.add_argument( '-ch', '--chinese_name', diff --git a/modelscope/cli/pipeline.py b/modelscope/cli/pipeline.py index 793632e05..2b6f7951a 100644 --- a/modelscope/cli/pipeline.py +++ b/modelscope/cli/pipeline.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging import os from argparse import ArgumentParser from string import Template @@ -6,7 +7,7 @@ from modelscope.cli.base import CLICommand from modelscope.utils.logger import get_logger -logger = get_logger() +logger = get_logger(log_level=logging.WARNING) current_path = os.path.dirname(os.path.abspath(__file__)) template_path = os.path.join(current_path, 'template') diff --git a/modelscope/cli/server.py b/modelscope/cli/server.py new file mode 100644 index 000000000..17d6ca4d0 --- /dev/null +++ b/modelscope/cli/server.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import logging +import os +from argparse import ArgumentParser +from string import Template + +from modelscope.cli.base import CLICommand +from modelscope.server.api_server import add_server_args, run_server +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) + +current_path = os.path.dirname(os.path.abspath(__file__)) +template_path = os.path.join(current_path, 'template') + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. 
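The login subcommand introduced in this diff is a thin wrapper around the hub API; the programmatic equivalent is a single call (the token string below is a placeholder for a real SDK token):

    from modelscope.hub.api import HubApi

    api = HubApi()
    api.login('YOUR_SDK_TOKEN')  # same call LoginCMD.execute makes with --token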
+ """ + return ServerCMD(args) + + +class ServerCMD(CLICommand): + name = 'server' + + def __init__(self, args): + self.args = args + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for create pipeline template command. + """ + parser = parsers.add_parser(ServerCMD.name) + add_server_args(parser) + parser.set_defaults(func=subparser_func) + + def execute(self): + run_server(self.args) diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py index 7fc094ac7..871ed3366 100644 --- a/modelscope/exporters/__init__.py +++ b/modelscope/exporters/__init__.py @@ -8,7 +8,7 @@ from .base import Exporter from .builder import build_exporter from .cv import CartoonTranslationExporter, FaceDetectionSCRFDExporter - from .multi_modal import StableDiffuisonExporter + from .multi_modal import StableDiffusionExporter from .nlp import (CsanmtForTranslationExporter, SbertForSequenceClassificationExporter, SbertForZeroShotClassificationExporter) @@ -19,7 +19,7 @@ 'base': ['Exporter'], 'builder': ['build_exporter'], 'cv': ['CartoonTranslationExporter', 'FaceDetectionSCRFDExporter'], - 'multi_modal': ['StableDiffuisonExporter'], + 'multi_modal': ['StableDiffusionExporter'], 'nlp': [ 'CsanmtForTranslationExporter', 'SbertForSequenceClassificationExporter', diff --git a/modelscope/exporters/multi_modal/__init__.py b/modelscope/exporters/multi_modal/__init__.py index ab565d1ca..f19b04f1c 100644 --- a/modelscope/exporters/multi_modal/__init__.py +++ b/modelscope/exporters/multi_modal/__init__.py @@ -5,10 +5,10 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .stable_diffusion_export import StableDiffuisonExporter + from .stable_diffusion_export import StableDiffusionExporter else: _import_structure = { - 'stable_diffusion_export': ['StableDiffuisonExporter'], + 'stable_diffusion_export': ['StableDiffusionExporter'], } import sys diff --git a/modelscope/exporters/multi_modal/stable_diffusion_exporter.py b/modelscope/exporters/multi_modal/stable_diffusion_exporter.py index 62ab0ce54..2c4319867 100644 --- a/modelscope/exporters/multi_modal/stable_diffusion_exporter.py +++ b/modelscope/exporters/multi_modal/stable_diffusion_exporter.py @@ -23,7 +23,7 @@ @EXPORTERS.register_module( Tasks.text_to_image_synthesis, module_name=Models.stable_diffusion) -class StableDiffuisonExporter(TorchModelExporter): +class StableDiffusionExporter(TorchModelExporter): @torch.no_grad() def export_onnx(self, diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py index 385cd02c5..986affb27 100644 --- a/modelscope/fileio/__init__.py +++ b/modelscope/fileio/__init__.py @@ -1,4 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from .file import File, LocalStorage -from .io import dump, dumps, load +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .file import File, LocalStorage + from .io import dump, dumps, load +else: + _import_structure = { + 'io': ['dump', 'dumps', 'load'], + 'file': ['File', 'LocalStorage'] + } + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py index 660e342a2..cb26e58d6 100644 --- a/modelscope/fileio/format/json.py +++ b/modelscope/fileio/format/json.py @@ -1,11 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import numpy as np - -from . import jsonplus from .base import FormatHandler def set_default(obj): + import numpy as np """Set default json values for non-serializable values. It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. @@ -25,10 +23,13 @@ class JsonHandler(FormatHandler): """Use jsonplus, serialization of Python types to JSON that "just works".""" def load(self, file): + from . import jsonplus return jsonplus.loads(file.read()) def dump(self, obj, file, **kwargs): + from . import jsonplus file.write(self.dumps(obj, **kwargs)) def dumps(self, obj, **kwargs): + from . import jsonplus return jsonplus.dumps(obj, **kwargs) diff --git a/modelscope/fileio/format/jsonplus.py b/modelscope/fileio/format/jsonplus.py index af59caeb0..48a4d512a 100644 --- a/modelscope/fileio/format/jsonplus.py +++ b/modelscope/fileio/format/jsonplus.py @@ -9,7 +9,7 @@ import threading import uuid from collections import namedtuple -from datetime import date, datetime, time, timedelta +from datetime import timedelta from dateutil.parser import parse as parse_datetime from decimal import Decimal from fractions import Fraction diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f83defd0e..a0d977125 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -15,13 +15,15 @@ from http.cookiejar import CookieJar from os.path import expanduser from typing import Dict, List, Optional, Tuple, Union +from urllib.parse import urlencode -import pandas as pd +import json import requests from requests import Session from requests.adapters import HTTPAdapter, Retry -from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT, +from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES, + API_HTTP_CLIENT_TIMEOUT, API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_EMAIL, API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, @@ -31,7 +33,8 @@ MODELSCOPE_CLOUD_ENVIRONMENT, MODELSCOPE_CLOUD_USERNAME, MODELSCOPE_REQUEST_ID, ONE_YEAR_SECONDS, - REQUESTS_API_HTTP_METHOD, Licenses, + REQUESTS_API_HTTP_METHOD, + DatasetVisibility, Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, NoValidRevisionError, @@ -45,13 +48,14 @@ DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, META_FILES_FORMAT, + REPO_TYPE_MODEL, ConfigFields, DatasetFormations, DatasetMetaFormats, DatasetVisibilityMap, DownloadChannel, - DownloadMode, ModelFile, - VirgoDatasetConfig) + DownloadMode, Frameworks, ModelFile, + Tasks, VirgoDatasetConfig) from modelscope.utils.logger import get_logger -from .utils.utils import (get_endpoint, get_release_datetime, - model_id_to_group_owner_name) +from .utils.utils import (get_endpoint, get_readable_folder_size, + get_release_datetime, model_id_to_group_owner_name) logger = get_logger() @@ -59,7 +63,10 @@ class HubApi: """Model hub api interface. 
""" - def __init__(self, endpoint: Optional[str] = None): + def __init__(self, + endpoint: Optional[str] = None, + timeout=API_HTTP_CLIENT_TIMEOUT, + max_retries=API_HTTP_CLIENT_MAX_RETRIES): """The ModelScope HubApi。 Args: @@ -69,11 +76,12 @@ def __init__(self, endpoint: Optional[str] = None): self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} self.session = Session() retry = Retry( - total=2, + total=max_retries, read=2, connect=2, backoff_factor=1, status_forcelist=(500, 502, 503, 504), + respect_retry_after_header=False, ) adapter = HTTPAdapter(max_retries=retry) self.session.mount('http://', adapter) @@ -84,12 +92,12 @@ def __init__(self, endpoint: Optional[str] = None): self.session, method, functools.partial( getattr(self.session, method), - timeout=API_HTTP_CLIENT_TIMEOUT)) + timeout=timeout)) def login( self, access_token: str, - ) -> tuple(): + ): """Login with your SDK access token, which can be obtained from https://www.modelscope.cn user center. @@ -241,6 +249,58 @@ def get_model( else: raise_for_http_status(r) + def repo_exists( + self, + repo_id: str, + *, + repo_type: Optional[str] = None, + ) -> bool: + """ + Checks if a repository exists on ModelScope + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + repo_type (`str`, *optional*): + `None` or `"model"` if getting repository info from a model. Default is `None`. + TODO: support dataset and studio + + Returns: + True if the repository exists, False otherwise. + """ + if (repo_type is not None) and repo_type.lower != REPO_TYPE_MODEL: + raise Exception('Not support repo-type: %s' % repo_type) + if (repo_id is None) or repo_id.count('/') != 1: + raise Exception('Invalid repo_id: %s, must be of format namespace/name' % repo_type) + + cookies = ModelScopeConfig.get_cookies() + owner_or_group, name = model_id_to_group_owner_name(repo_id) + path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}' + + r = self.session.get(path, cookies=cookies, + headers=self.builder_headers(self.headers)) + code = handle_http_response(r, logger, cookies, repo_id, False) + if code == 200: + return True + elif code == 404: + return False + else: + logger.warn(f'Check repo_exists return status code {code}.') + raise Exception( + 'Failed to check existence of repo: %s, make sure you have access authorization.' + % repo_type) + + @staticmethod + def _create_default_config(model_dir): + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + cfg = { + ConfigFields.framework: Frameworks.torch, + ConfigFields.task: Tasks.other, + } + with open(cfg_file, 'w') as file: + json.dump(cfg, file) + def push_model(self, model_id: str, model_dir: str, @@ -264,6 +324,8 @@ def push_model(self, This function must be called before calling HubApi's login with a valid token which can be obtained from ModelScope's website. + If any error, please upload via git commands. + Args: model_id (str): The model id to be uploaded, caller must have write permission for it. 
@@ -306,23 +368,23 @@ def push_model(self, raise InvalidParameter('model_dir must be a valid directory.') cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) if not os.path.exists(cfg_file): - raise ValueError(f'{model_dir} must contain a configuration.json.') + logger.warning( + f'No {ModelFile.CONFIGURATION} file found in {model_dir}, creating a default one.') + HubApi._create_default_config(model_dir) + cookies = ModelScopeConfig.get_cookies() if cookies is None: raise NotLoginException('Must login before upload!') files_to_save = os.listdir(model_dir) + folder_size = get_readable_folder_size(model_dir) if ignore_file_pattern is None: ignore_file_pattern = [] if isinstance(ignore_file_pattern, str): ignore_file_pattern = [ignore_file_pattern] - try: - self.get_model(model_id=model_id) - except Exception: - if visibility is None or license is None: - raise InvalidParameter( - 'visibility and license cannot be empty if want to create new repo' - ) - logger.info('Create new model %s' % model_id) + if visibility is None or license is None: + raise InvalidParameter('Visibility and License cannot be empty for new model.') + if not self.repo_exists(model_id): + logger.info('Creating new model [%s]' % model_id) self.create_model( model_id=model_id, visibility=visibility, @@ -331,11 +393,13 @@ def push_model(self, original_model_id=original_model_id) tmp_dir = tempfile.mkdtemp() git_wrapper = GitCommandWrapper() + logger.info(f'Pushing folder {model_dir} as model {model_id}.') + logger.info(f'Total folder size {folder_size}, this may take a while depending on actual pushing size...') try: repo = Repository(model_dir=tmp_dir, clone_from=model_id) branches = git_wrapper.get_remote_branches(tmp_dir) if revision not in branches: - logger.info('Create new branch %s' % revision) + logger.info('Creating new branch %s' % revision) git_wrapper.new_branch(tmp_dir, revision) git_wrapper.checkout(tmp_dir, revision) files_in_repo = os.listdir(tmp_dir) @@ -399,7 +463,7 @@ def list_models(self, (owner_or_group, page_number, page_size), cookies=cookies, headers=self.builder_headers(self.headers)) - handle_http_response(r, logger, cookies, 'list_model') + handle_http_response(r, logger, cookies, owner_or_group) if r.status_code == HTTPStatus.OK: if is_ok(r.json()): data = r.json()[API_RESPONSE_FIELD_DATA] @@ -429,6 +493,30 @@ def list_model_revisions( use_cookies: Union[bool, CookieJar] = False) -> List[str]: """Get model branch and tags. + Args: + model_id (str): The model id + cutoff_timestamp (int): Tags created before the cutoff will be included. + The timestamp is represented by the seconds elapsed from the epoch time. + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, + will load cookie from local. Defaults to False. + + Returns: + Tuple[List[str], List[str]]: Return list of branch name and tags + """ + tags_details = self.list_model_revisions_detail(model_id=model_id, + cutoff_timestamp=cutoff_timestamp, + use_cookies=use_cookies) + tags = [x['Revision'] for x in tags_details + ] if tags_details else [] + return tags + + def list_model_revisions_detail( + self, + model_id: str, + cutoff_timestamp: Optional[int] = None, + use_cookies: Union[bool, CookieJar] = False) -> List[str]: + """Get model branch and tags. + Args: model_id (str): The model id cutoff_timestamp (int): Tags created before the cutoff will be included. 
@@ -450,65 +538,89 @@ def list_model_revisions( raise_on_error(d) info = d[API_RESPONSE_FIELD_DATA] # tags returned from backend are guaranteed to be ordered by create-time - tags = [x['Revision'] for x in info['RevisionMap']['Tags'] - ] if info['RevisionMap']['Tags'] else [] - return tags + return info['RevisionMap']['Tags'] - def get_valid_revision(self, - model_id: str, - revision=None, - cookies: Optional[CookieJar] = None): + def get_branch_tag_detail(self, details, name): + for item in details: + if item['Revision'] == name: + return item + return None + + def get_valid_revision_detail(self, + model_id: str, + revision=None, + cookies: Optional[CookieJar] = None): release_timestamp = get_release_datetime() current_timestamp = int(round(datetime.datetime.now().timestamp())) # for active development in library codes (non-release-branches), release_timestamp # is set to be a far-away-time-in-the-future, to ensure that we shall # get the master-HEAD version from model repo by default (when no revision is provided) + all_branches_detail, all_tags_detail = self.get_model_branches_and_tags_details( + model_id, use_cookies=False if cookies is None else cookies) + all_branches = [x['Revision'] for x in all_branches_detail] if all_branches_detail else [] + all_tags = [x['Revision'] for x in all_tags_detail] if all_tags_detail else [] if release_timestamp > current_timestamp + ONE_YEAR_SECONDS: - branches, tags = self.get_model_branches_and_tags( - model_id, use_cookies=False if cookies is None else cookies) if revision is None: revision = MASTER_MODEL_BRANCH logger.info( 'Model revision not specified, use default: %s in development mode' % revision) - if revision not in branches and revision not in tags: + if revision not in all_branches and revision not in all_tags: raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision)) - logger.info('Development mode use revision: %s' % revision) + + revision_detail = self.get_branch_tag_detail(all_tags_detail, revision) + if revision_detail is None: + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + logger.debug('Development mode use revision: %s' % revision) else: - all_revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=current_timestamp, - use_cookies=False if cookies is None else cookies) - if len(all_revisions) == 0: + if revision is not None and revision in all_branches: + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + logger.warning('Using branch: %s as version is unstable, use with caution' % revision) + return revision_detail + + if len(all_tags_detail) == 0: # use no revision use master as default. if revision is None or revision == MASTER_MODEL_BRANCH: revision = MASTER_MODEL_BRANCH else: raise NotExistError('The model: %s has no revision: %s !' % (model_id, revision)) + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) else: if revision is None: # user not specified revision, use latest revision before release time - revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=release_timestamp, - use_cookies=False if cookies is None else cookies) - if len(revisions) > 0: - revision = revisions[0] # use latest revision before release time. + revisions_detail = [x for x in + all_tags_detail if x['CreatedAt'] <= release_timestamp] if all_tags_detail else [] # noqa E501 + if len(revisions_detail) > 0: + revision = revisions_detail[0]['Revision'] # use latest revision before release time. 
+ revision_detail = revisions_detail[0] else: - vl = '[%s]' % ','.join(all_revisions) - raise NoValidRevisionError('Model revision should be specified from revisions: %s' % (vl)) + revision = MASTER_MODEL_BRANCH + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + vl = '[%s]' % ','.join(all_tags) + logger.warning('Model revision should be specified from revisions: %s' % (vl)) logger.warning('Model revision not specified, use revision: %s' % revision) else: # use user-specified revision - if revision not in all_revisions: + if revision not in all_tags: if revision == MASTER_MODEL_BRANCH: logger.warning('Using the master branch is fragile, please use it with caution!') + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) else: - vl = '[%s]' % ','.join(all_revisions) + vl = '[%s]' % ','.join(all_tags) raise NotExistError('The model: %s has no revision: %s valid are: %s!' % (model_id, revision, vl)) + else: + revision_detail = self.get_branch_tag_detail(all_tags_detail, revision) logger.info('Use user-specified model revision: %s' % revision) - return revision + return revision_detail - def get_model_branches_and_tags( + def get_valid_revision(self, + model_id: str, + revision=None, + cookies: Optional[CookieJar] = None): + return self.get_valid_revision_detail(model_id=model_id, + revision=revision, + cookies=cookies)['Revision'] + + def get_model_branches_and_tags_details( self, model_id: str, use_cookies: Union[bool, CookieJar] = False, @@ -532,10 +644,29 @@ def get_model_branches_and_tags( d = r.json() raise_on_error(d) info = d[API_RESPONSE_FIELD_DATA] - branches = [x['Revision'] for x in info['RevisionMap']['Branches'] - ] if info['RevisionMap']['Branches'] else [] - tags = [x['Revision'] for x in info['RevisionMap']['Tags'] - ] if info['RevisionMap']['Tags'] else [] + return info['RevisionMap']['Branches'], info['RevisionMap']['Tags'] + + def get_model_branches_and_tags( + self, + model_id: str, + use_cookies: Union[bool, CookieJar] = False, + ) -> Tuple[List[str], List[str]]: + """Get model branch and tags. + + Args: + model_id (str): The model id + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, + will load cookie from local. Defaults to False. 
+ + Returns: + Tuple[List[str], List[str]]: Return list of branch name and tags + """ + branches_detail, tags_detail = self.get_model_branches_and_tags_details(model_id=model_id, + use_cookies=use_cookies) + branches = [x['Revision'] for x in branches_detail + ] if branches_detail else [] + tags = [x['Revision'] for x in tags_detail + ] if tags_detail else [] return branches, tags def get_model_files(self, @@ -585,6 +716,64 @@ def get_model_files(self, files.append(file) return files + def file_exists( + self, + repo_id: str, + filename: str, + *, + revision: Optional[str] = None, + ): + """Get if the specified file exists + + Args: + repo_id (`str`): The repo id to use + filename (`str`): The queried filename + revision (`Optional[str]`): The repo revision + Returns: + The query result in bool value + """ + files = self.get_model_files(repo_id, revision=revision) + files = [file['Name'] for file in files] + return filename in files + + def create_dataset(self, + dataset_name: str, + namespace: str, + chinese_name: Optional[str] = '', + license: Optional[str] = Licenses.APACHE_V2, + visibility: Optional[int] = DatasetVisibility.PUBLIC, + description: Optional[str] = '') -> str: + + if dataset_name is None or namespace is None: + raise InvalidParameter('dataset_name and namespace are required!') + + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise ValueError('Token does not exist, please login first.') + + path = f'{self.endpoint}/api/v1/datasets' + files = { + 'Name': (None, dataset_name), + 'ChineseName': (None, chinese_name), + 'Owner': (None, namespace), + 'License': (None, license), + 'Visibility': (None, visibility), + 'Description': (None, description) + } + + r = self.session.post( + path, + files=files, + cookies=cookies, + headers=self.builder_headers(self.headers), + ) + + handle_http_post_error(r, path, files) + raise_on_error(r.json()) + dataset_repo_url = f'{self.endpoint}/datasets/{namespace}/{dataset_name}' + logger.info(f'Create dataset success: {dataset_repo_url}') + return dataset_repo_url + def list_datasets(self): path = f'{self.endpoint}/api/v1/datasets' params = {} @@ -600,11 +789,56 @@ def get_dataset_id_and_type(self, dataset_name: str, namespace: str): cookies = ModelScopeConfig.get_cookies() r = self.session.get(datahub_url, cookies=cookies) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + datahub_raise_on_error(datahub_url, resp, r) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] return dataset_id, dataset_type + def get_dataset_infos(self, + dataset_hub_id: str, + revision: str, + files_metadata: bool = False, + timeout: float = 100, + recursive: str = 'True'): + """ + Get dataset infos. 
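Two quick sketches for the helpers added above: file_exists checks the file list of a model repo, and create_dataset creates a new dataset repo (a prior login is required). The repo, file and dataset names are placeholders; the license and visibility constants come from this diff:

    from modelscope.hub.api import HubApi
    from modelscope.hub.constants import DatasetVisibility, Licenses

    api = HubApi()

    has_config = api.file_exists('namespace/model-name', 'configuration.json')

    repo_url = api.create_dataset(
        dataset_name='my-dataset',
        namespace='my-namespace',
        license=Licenses.APACHE_V2,
        visibility=DatasetVisibility.PUBLIC,
    )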
+ """ + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree' + params = {'Revision': revision, 'Root': None, 'Recursive': recursive} + cookies = ModelScopeConfig.get_cookies() + if files_metadata: + params['blobs'] = True + r = self.session.get(datahub_url, params=params, cookies=cookies, timeout=timeout) + resp = r.json() + datahub_raise_on_error(datahub_url, resp, r) + + return resp + + def list_repo_tree(self, + dataset_name: str, + namespace: str, + revision: str, + root_path: str, + recursive: bool = True, + page_number: int = 1, + page_size: int = 100): + + dataset_hub_id, dataset_type = self.get_dataset_id_and_type( + dataset_name=dataset_name, namespace=namespace) + + recursive = 'True' if recursive else 'False' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree' + params = {'Revision': revision if revision else 'master', + 'Root': root_path if root_path else '/', 'Recursive': recursive, + 'PageNumber': page_number, 'PageSize': page_size} + cookies = ModelScopeConfig.get_cookies() + + r = self.session.get(datahub_url, params=params, cookies=cookies) + resp = r.json() + datahub_raise_on_error(datahub_url, resp, r) + + return resp + def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_id: str, revision: str): """ Get the meta file-list of the dataset. """ datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' @@ -613,7 +847,7 @@ def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_ cookies=cookies, headers=self.builder_headers(self.headers)) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + datahub_raise_on_error(datahub_url, resp, r) file_list = resp['Data'] if file_list is None: raise NotExistError( @@ -626,8 +860,10 @@ def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_ @staticmethod def dump_datatype_file(dataset_type: int, meta_cache_dir: str): """ - Dump the data_type as a local file, in order to get the dataset formation without calling the datahub. - More details, please refer to the class `modelscope.utils.constant.DatasetFormations`. + Dump the data_type as a local file, in order to get the dataset + formation without calling the datahub. + More details, please refer to the class + `modelscope.utils.constant.DatasetFormations`. """ dataset_type_file_path = os.path.join(meta_cache_dir, f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}') @@ -673,8 +909,9 @@ def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode. Fetch the meta-data files from the url, e.g. csv/jsonl files. 
""" import hashlib - import json - from tqdm import tqdm + from tqdm.auto import tqdm + import pandas as pd + out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest()) if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path): os.remove(out_path) @@ -712,7 +949,7 @@ def get_chunk(resp): else: with_header = False chunk_df = pd.DataFrame(chunk) - chunk_df.to_csv(f, index=False, header=with_header) + chunk_df.to_csv(f, index=False, header=with_header, escapechar='\\') iter_num += 1 else: # csv or others @@ -723,6 +960,35 @@ def get_chunk(resp): return out_path def get_dataset_file_url( + self, + file_name: str, + dataset_name: str, + namespace: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + view: Optional[bool] = False, + extension_filter: Optional[bool] = True): + + if not file_name or not dataset_name or not namespace: + raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!') + + # Note: make sure the FilePath is the last parameter in the url + params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': file_name, 'View': view} + params: str = urlencode(params) + file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?{params}' + + return file_url + + # if extension_filter: + # if os.path.splitext(file_name)[-1] in META_FILES_FORMAT: + # file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'\ + # f'Revision={revision}&FilePath={file_name}' + # else: + # file_url = file_name + # return file_url + # else: + # return file_url + + def get_dataset_file_url_origin( self, file_name: str, dataset_name: str, @@ -866,10 +1132,10 @@ def datahub_remote_call(self, url): cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) resp = r.json() - datahub_raise_on_error(url, resp) + datahub_raise_on_error(url, resp, r) return resp['Data'] - def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool) -> None: + def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool = False) -> None: is_ci_test = os.getenv('CI_TEST') == 'True' if dataset_name and namespace and not is_ci_test and not use_streaming: try: @@ -902,6 +1168,10 @@ def builder_headers(self, headers): return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex), **headers} + def get_file_base_path(self, namespace: str, dataset_name: str) -> str: + return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' 
+ # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath=' + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) @@ -909,6 +1179,7 @@ class ModelScopeConfig: GIT_TOKEN_FILE_NAME = 'git_token' USER_INFO_FILE_NAME = 'user' USER_SESSION_ID_FILE_NAME = 'session' + cookie_expired_warning = False @staticmethod def make_sure_credential_path_exist(): @@ -930,10 +1201,12 @@ def get_cookies(): with open(cookies_path, 'rb') as f: cookies = pickle.load(f) for cookie in cookies: - if cookie.is_expired(): - logger.warning( + if cookie.is_expired() and not ModelScopeConfig.cookie_expired_warning: + ModelScopeConfig.cookie_expired_warning = True + logger.debug( 'Authentication has expired, ' - 'please re-login if you need to access private models or datasets.') + 'please re-login with modelscope login --token "YOUR_SDK_TOKEN" ' + 'if you need to access private models or datasets.') return None return cookies return None diff --git a/modelscope/hub/check_model.py b/modelscope/hub/check_model.py index ab1500dbd..59a77bfee 100644 --- a/modelscope/hub/check_model.py +++ b/modelscope/hub/check_model.py @@ -48,7 +48,7 @@ def check_local_model_is_latest( 'Snapshot': 'True' } } - _api = HubApi() + _api = HubApi(timeout=0.5) try: _, revisions = _api.get_model_branches_and_tags( model_id=model_id, use_cookies=cookies) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 3ebc167d7..b3d03e1ae 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -17,10 +17,11 @@ DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete'] API_HTTP_CLIENT_TIMEOUT = 60 +API_HTTP_CLIENT_MAX_RETRIES = 2 API_RESPONSE_FIELD_DATA = 'Data' API_FILE_DOWNLOAD_RETRY_TIMES = 5 -API_FILE_DOWNLOAD_TIMEOUT = 30 -API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 16 +API_FILE_DOWNLOAD_TIMEOUT = 60 +API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 1 API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' API_RESPONSE_FIELD_USERNAME = 'Username' API_RESPONSE_FIELD_EMAIL = 'Email' @@ -28,10 +29,10 @@ MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_CLOUD_USERNAME = 'MODELSCOPE_USERNAME' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION = 'MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 -MODEL_META_FILE_NAME = '.mdl' -MODEL_META_MODEL_ID = 'id' MODELSCOPE_REQUEST_ID = 'X-Request-ID' +TEMPORARY_FOLDER_NAME = '._____temp' class Licenses(object): @@ -49,3 +50,9 @@ class ModelVisibility(object): PRIVATE = 1 INTERNAL = 3 PUBLIC = 5 + + +class DatasetVisibility(object): + PRIVATE = 1 + INTERNAL = 3 + PUBLIC = 5 diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 48bb5fe0c..986425d2d 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import logging
 from http import HTTPStatus
+from typing import Optional

 import requests
 from requests.exceptions import HTTPError
@@ -8,7 +10,7 @@
 from modelscope.hub.constants import MODELSCOPE_REQUEST_ID
 from modelscope.utils.logger import get_logger

-logger = get_logger()
+logger = get_logger(log_level=logging.WARNING)


 class NotSupportError(Exception):
@@ -85,18 +87,41 @@ def handle_http_post_error(response, url, request_body):
             (url, request_body, message, get_request_id(response))) from error


-def handle_http_response(response: requests.Response, logger, cookies,
-                         model_id):
-    try:
-        response.raise_for_status()
-    except HTTPError as error:
-        if cookies is None:  # code in [403] and
-            logger.error(
-                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
-                private. Please login first.')
-        message = _decode_response_error(response)
-        raise HTTPError('Response details: %s, Request id: %s' %
-                        (message, get_request_id(response))) from error
+def handle_http_response(response: requests.Response,
+                         logger,
+                         cookies,
+                         model_id,
+                         raise_on_error: Optional[bool] = True) -> int:
+    http_error_msg = ''
+    if isinstance(response.reason, bytes):
+        try:
+            reason = response.reason.decode('utf-8')
+        except UnicodeDecodeError:
+            reason = response.reason.decode('iso-8859-1')
+    else:
+        reason = response.reason
+    request_id = get_request_id(response)
+    if 404 == response.status_code:
+        http_error_msg = 'The request model: %s does not exist!' % (model_id)
+    elif 403 == response.status_code:
+        if cookies is None:
+            http_error_msg = 'Authentication token does not exist, ' \
+                'failed to access model %s which may not exist or may be ' \
+                'private. Please login first.' % model_id
+        else:
+            http_error_msg = 'The authentication token is invalid, failed to access model %s.' % model_id
+    elif 400 <= response.status_code < 500:
+        http_error_msg = u'%s Client Error: %s, Request id: %s for url: %s' % (
+            response.status_code, reason, request_id, response.url)
+
+    elif 500 <= response.status_code < 600:
+        http_error_msg = u'%s Server Error: %s, Request id: %s, for url: %s' % (
+            response.status_code, reason, request_id, response.url)
+    if http_error_msg and raise_on_error:  # there is an error.
+        logger.error(http_error_msg)
+        raise HTTPError(http_error_msg, response=response)
+    else:
+        return response.status_code


 def raise_on_error(rsp):
@@ -117,12 +142,13 @@ def raise_on_error(rsp):
         raise RequestError(rsp['Message'])


-def datahub_raise_on_error(url, rsp):
+def datahub_raise_on_error(url, rsp, http_response: requests.Response):
     """If response error, raise exception

     Args:
         url (str): The request url
         rsp (HTTPResponse): The server response.
+        http_response: the origin http response.

     Raises:
         RequestError: the http request error.
@@ -133,9 +159,9 @@ def datahub_raise_on_error(url, rsp): if rsp.get('Code') == HTTPStatus.OK: return True else: - request_id = get_request_id(rsp) + request_id = rsp['RequestId'] raise RequestError( - f"Url = {url}, Request id={request_id} Message = {rsp.get('Message')},\ + f"Url = {url}, Request id={request_id} Code = {rsp['Code']} Message = {rsp['Message']},\ Please specify correct dataset_name and namespace.") @@ -159,7 +185,12 @@ def raise_for_http_status(rsp): else: reason = rsp.reason request_id = get_request_id(rsp) - if 400 <= rsp.status_code < 500: + if 404 == rsp.status_code: + http_error_msg = 'The request resource(model or dataset) does not exist!,' + 'url: %s, reason: %s' % (rsp.url, reason) + elif 403 == rsp.status_code: + http_error_msg = 'Authentication token does not exist or invalid.' + elif 400 <= rsp.status_code < 500: http_error_msg = u'%s Client Error: %s, Request id: %s for url: %s' % ( rsp.status_code, reason, request_id, rsp.url) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index c37b716ad..40ac8a038 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -1,9 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import copy +import io import os import tempfile -import threading +import urllib import uuid from concurrent.futures import ThreadPoolExecutor from functools import partial @@ -13,19 +14,24 @@ import requests from requests.adapters import Retry -from tqdm import tqdm +from tqdm.auto import tqdm from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.hub.constants import ( API_FILE_DOWNLOAD_CHUNK_SIZE, API_FILE_DOWNLOAD_RETRY_TIMES, API_FILE_DOWNLOAD_TIMEOUT, FILE_HASH, MODELSCOPE_DOWNLOAD_PARALLELS, - MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB) -from modelscope.utils.constant import DEFAULT_MODEL_REVISION + MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB, TEMPORARY_FOLDER_NAME) +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION, + REPO_TYPE_DATASET, REPO_TYPE_MODEL, + REPO_TYPE_SUPPORT) +from modelscope.utils.file_utils import (get_dataset_cache_root, + get_model_cache_root) from modelscope.utils.logger import get_logger -from .errors import FileDownloadError, NotExistError +from .errors import FileDownloadError, InvalidParameter, NotExistError from .utils.caching import ModelFileSystemCache -from .utils.utils import (file_integrity_validation, get_cache_dir, - get_endpoint, model_id_to_group_owner_name) +from .utils.utils import (file_integrity_validation, get_endpoint, + model_id_to_group_owner_name) logger = get_logger() @@ -38,6 +44,7 @@ def model_file_download( user_agent: Union[Dict, str, None] = None, local_files_only: Optional[bool] = False, cookies: Optional[CookieJar] = None, + local_dir: Optional[str] = None, ) -> Optional[str]: # pragma: no cover """Download from a given URL and cache it if it's not already present in the local cache. @@ -55,6 +62,7 @@ def model_file_download( local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the local cached file if it exists. if `False`, download the file anyway even it exists. cookies (CookieJar, optional): The cookie of download request. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. 
Returns: string: string of local file or if networking is off, last version of @@ -74,16 +82,97 @@ def model_file_download( - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) if some parameter value is invalid """ - if cache_dir is None: - cache_dir = get_cache_dir() - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - temporary_cache_dir = os.path.join(cache_dir, 'temp') - os.makedirs(temporary_cache_dir, exist_ok=True) + return _repo_file_download( + model_id, + file_path, + repo_type=REPO_TYPE_MODEL, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + local_dir=local_dir) - group_or_owner, name = model_id_to_group_owner_name(model_id) - cache = ModelFileSystemCache(cache_dir, group_or_owner, name) +def dataset_file_download( + dataset_id: str, + file_path: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + cache_dir: Union[str, Path, None] = None, + local_dir: Optional[str] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, +) -> str: + """Download raw files of a dataset. + Downloads all files at the specified revision. This + is useful when you want all files from a dataset, because you don't know which + ones you will need a priori. All files are nested inside a folder in order + to keep their actual filename relative to that folder. + + An alternative would be to just clone a dataset but this would require that the + user always has git and git-lfs installed, and properly configured. + + Args: + dataset_id (str): A user or an organization name and a dataset name separated by a `/`. + file_path (str): The relative path of the file to download. + revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (str, Path, optional): Path to the folder where cached files are stored, dataset file will + be save as cache_dir/dataset_id/THE_DATASET_FILES. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. + user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + cookies (CookieJar, optional): The cookie of the request, default None. + Raises: + ValueError: the value details. + + Returns: + str: Local folder path (string) of repo snapshot + + Note: + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. 
+ - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + """ + return _repo_file_download( + dataset_id, + file_path, + repo_type=REPO_TYPE_DATASET, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + local_dir=local_dir) + + +def _repo_file_download( + repo_id: str, + file_path: str, + *, + repo_type: str = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + local_dir: Optional[str] = None, +) -> Optional[str]: # pragma: no cover + + if not repo_type: + repo_type = REPO_TYPE_MODEL + if repo_type not in REPO_TYPE_SUPPORT: + raise InvalidParameter('Invalid repo type: %s, only support: %s' % + (repo_type, REPO_TYPE_SUPPORT)) + + temporary_cache_dir, cache = create_temporary_directory_and_cache( + repo_id, local_dir=local_dir, cache_dir=cache_dir, repo_type=repo_type) # if local_files_only is `True` and the file already exists in cached_path # return the cached path @@ -97,7 +186,7 @@ def model_file_download( else: raise ValueError( 'Cannot find the requested files in the cached path and outgoing' - ' traffic has been disabled. To enable model look-ups and downloads' + ' traffic has been disabled. To enable look-ups and downloads' " online, set 'local_files_only' to False.") _api = HubApi() @@ -106,64 +195,115 @@ def model_file_download( } if cookies is None: cookies = ModelScopeConfig.get_cookies() + repo_files = [] + file_to_download_meta = None + if repo_type == REPO_TYPE_MODEL: + revision = _api.get_valid_revision( + repo_id, revision=revision, cookies=cookies) + # we need to confirm the version is up-to-date + # we need to get the file list to check if the latest version is cached, if so return, otherwise download + repo_files = _api.get_model_files( + model_id=repo_id, + revision=revision, + recursive=True, + use_cookies=False if cookies is None else cookies) + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + + if repo_file['Path'] == file_path: + if cache.exists(repo_file): + logger.debug( + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' + ) + return cache.get_file_by_info(repo_file) + else: + file_to_download_meta = repo_file + break + elif repo_type == REPO_TYPE_DATASET: + group_or_owner, name = model_id_to_group_owner_name(repo_id) + if not revision: + revision = DEFAULT_DATASET_REVISION + page_number = 1 + page_size = 100 + while True: + files_list_tree = _api.list_repo_tree( + dataset_name=name, + namespace=group_or_owner, + revision=revision, + root_path='/', + recursive=True, + page_number=page_number, + page_size=page_size) + if not ('Code' in files_list_tree + and files_list_tree['Code'] == 200): + print( + 'Get dataset: %s file list failed, request_id: %s, message: %s' + % (repo_id, files_list_tree['RequestId'], + files_list_tree['Message'])) + return None + repo_files = files_list_tree['Data']['Files'] + is_exist = False + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + + if repo_file['Path'] == file_path: + if cache.exists(repo_file): + logger.debug( + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' 
+ ) + return cache.get_file_by_info(repo_file) + else: + file_to_download_meta = repo_file + is_exist = True + break + if len(repo_files) < page_size or is_exist: + break + page_number += 1 - revision = _api.get_valid_revision( - model_id, revision=revision, cookies=cookies) - file_to_download_info = None - # we need to confirm the version is up-to-date - # we need to get the file list to check if the latest version is cached, if so return, otherwise download - model_files = _api.get_model_files( - model_id=model_id, - revision=revision, - recursive=True, - use_cookies=False if cookies is None else cookies) - - for model_file in model_files: - if model_file['Type'] == 'tree': - continue - - if model_file['Path'] == file_path: - if cache.exists(model_file): - logger.debug( - f'File {model_file["Name"]} already in cache, skip downloading!' - ) - return cache.get_file_by_info(model_file) - else: - file_to_download_info = model_file - break - - if file_to_download_info is None: + if file_to_download_meta is None: raise NotExistError('The file path: %s not exist in: %s' % - (file_path, model_id)) + (file_path, repo_id)) # we need to download again - url_to_download = get_file_download_url(model_id, file_path, revision) - temp_file_name = next(tempfile._get_candidate_names()) + if repo_type == REPO_TYPE_MODEL: + url_to_download = get_file_download_url(repo_id, file_path, revision) + elif repo_type == REPO_TYPE_DATASET: + url_to_download = _api.get_dataset_file_url( + file_name=file_to_download_meta['Path'], + dataset_name=name, + namespace=group_or_owner, + revision=revision) + return download_file(url_to_download, file_to_download_meta, + temporary_cache_dir, cache, headers, cookies) + + +def create_temporary_directory_and_cache(model_id: str, + local_dir: str = None, + cache_dir: str = None, + repo_type: str = REPO_TYPE_MODEL): + if repo_type == REPO_TYPE_MODEL: + default_cache_root = get_model_cache_root() + elif repo_type == REPO_TYPE_DATASET: + default_cache_root = get_dataset_cache_root() - if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < file_to_download_info[ - 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: - parallel_download( - url_to_download, - temporary_cache_dir, - temp_file_name, - headers=headers, - cookies=None if cookies is None else cookies.get_dict(), - file_size=file_to_download_info['Size']) + group_or_owner, name = model_id_to_group_owner_name(model_id) + if local_dir is not None: + temporary_cache_dir = os.path.join(local_dir, TEMPORARY_FOLDER_NAME) + cache = ModelFileSystemCache(local_dir) else: - http_get_file( - url_to_download, - temporary_cache_dir, - temp_file_name, - headers=headers, - cookies=None if cookies is None else cookies.get_dict()) + if cache_dir is None: + cache_dir = default_cache_root + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + temporary_cache_dir = os.path.join(cache_dir, TEMPORARY_FOLDER_NAME, + group_or_owner, name) + name = name.replace('.', '___') + cache = ModelFileSystemCache(cache_dir, group_or_owner, name) - temp_file_path = os.path.join(temporary_cache_dir, temp_file_name) - # for download with commit we can't get Sha256 - if file_to_download_info[FILE_HASH] is not None: - file_integrity_validation(temp_file_path, - file_to_download_info[FILE_HASH]) - return cache.put_file(file_to_download_info, - os.path.join(temporary_cache_dir, temp_file_name)) + os.makedirs(temporary_cache_dir, exist_ok=True) + return temporary_cache_dir, cache def get_file_download_url(model_id: str, file_path: str, revision: str): @@ 
-179,6 +319,8 @@ def get_file_download_url(model_id: str, file_path: str, revision: str): Returns: str: The file url. """ + file_path = urllib.parse.quote_plus(file_path) + revision = urllib.parse.quote_plus(revision) download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}' return download_url_template.format( endpoint=get_endpoint(), @@ -190,18 +332,27 @@ def get_file_download_url(model_id: str, file_path: str, revision: str): def download_part_with_retry(params): # unpack parameters - progress, start, end, url, file_name, cookies, headers = params + model_file_path, progress, start, end, url, file_name, cookies, headers = params get_headers = {} if headers is None else copy.deepcopy(headers) - get_headers['Range'] = 'bytes=%s-%s' % (start, end) get_headers['X-Request-ID'] = str(uuid.uuid4().hex) retry = Retry( total=API_FILE_DOWNLOAD_RETRY_TIMES, backoff_factor=1, allowed_methods=['GET']) + part_file_name = model_file_path + '_%s_%s' % (start, end) while True: try: - with open(file_name, 'rb+') as f: - f.seek(start) + partial_length = 0 + if os.path.exists( + part_file_name): # download partial, continue download + with open(part_file_name, 'rb') as f: + partial_length = f.seek(0, io.SEEK_END) + progress.update(partial_length) + download_start = start + partial_length + if download_start > end: + break # this part is download completed. + get_headers['Range'] = 'bytes=%s-%s' % (download_start, end) + with open(part_file_name, 'ab+') as f: r = requests.get( url, stream=True, @@ -212,12 +363,12 @@ def download_part_with_retry(params): chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): if chunk: # filter out keep-alive new chunks f.write(chunk) - progress.update(end - start) + progress.update(len(chunk)) break except (Exception) as e: # no matter what exception, we will retry. retry = retry.increment('GET', url, error=e) - logger.warning('Download file from: %s to: %s failed, will retry' % - (start, end)) + logger.warning('Downloading: %s failed, reason: %s will retry' % + (model_file_path, e)) retry.sleep() @@ -230,36 +381,130 @@ def parallel_download( file_size: int = None, ): # create temp file - temp_file_manager = partial( - tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False) - with temp_file_manager() as temp_file: - progress = tqdm( + with tqdm( unit='B', unit_scale=True, unit_divisor=1024, total=file_size, initial=0, - desc='Downloading', - ) + desc='Downloading [' + file_name + ']', + leave=True, + ) as progress: PART_SIZE = 160 * 1024 * 1024 # every part is 160M tasks = [] + file_path = os.path.join(local_dir, file_name) + os.makedirs(os.path.dirname(file_path), exist_ok=True) for idx in range(int(file_size / PART_SIZE)): start = idx * PART_SIZE end = (idx + 1) * PART_SIZE - 1 - tasks.append( - (progress, start, end, url, temp_file.name, cookies, headers)) + tasks.append((file_path, progress, start, end, url, file_name, + cookies, headers)) if end + 1 < file_size: - tasks.append((progress, end + 1, file_size - 1, url, - temp_file.name, cookies, headers)) + tasks.append((file_path, progress, end + 1, file_size - 1, url, + file_name, cookies, headers)) parallels = MODELSCOPE_DOWNLOAD_PARALLELS if MODELSCOPE_DOWNLOAD_PARALLELS <= 4 else 4 + # download every part with ThreadPoolExecutor( max_workers=parallels, thread_name_prefix='download') as executor: list(executor.map(download_part_with_retry, tasks)) - progress.close() + # merge parts. 
+ with open(os.path.join(local_dir, file_name), 'wb') as output_file: + for task in tasks: + part_file_name = task[0] + '_%s_%s' % (task[2], task[3]) + with open(part_file_name, 'rb') as part_file: + output_file.write(part_file.read()) + os.remove(part_file_name) - os.replace(temp_file.name, os.path.join(local_dir, file_name)) + +def http_get_model_file( + url: str, + local_dir: str, + file_name: str, + file_size: int, + cookies: CookieJar, + headers: Optional[Dict[str, str]] = None, +): + """Download remote file, will retry 5 times before giving up on errors. + + Args: + url(str): + actual download url of the file + local_dir(str): + local directory where the downloaded file stores + file_name(str): + name of the file stored in `local_dir` + file_size(int): + The file size. + cookies(CookieJar): + cookies used to authentication the user, which is used for downloading private repos + headers(Dict[str, str], optional): + http headers to carry necessary info when requesting the remote file + + Raises: + FileDownloadError: File download failed. + + """ + get_headers = {} if headers is None else copy.deepcopy(headers) + get_headers['X-Request-ID'] = str(uuid.uuid4().hex) + temp_file_path = os.path.join(local_dir, file_name) + os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) + logger.debug('downloading %s to %s', url, temp_file_path) + # retry sleep 0.5s, 1s, 2s, 4s + retry = Retry( + total=API_FILE_DOWNLOAD_RETRY_TIMES, + backoff_factor=1, + allowed_methods=['GET']) + while True: + try: + with tqdm( + unit='B', + unit_scale=True, + unit_divisor=1024, + total=file_size if file_size > 0 else 1, + initial=0, + desc='Downloading [' + file_name + ']', + leave=True, + ) as progress: + if file_size == 0: + # Avoid empty file server request + with open(temp_file_path, 'w+'): + progress.update(1) + break + # Determine the length of any existing partial download + partial_length = 0 + # download partial, continue download + if os.path.exists(temp_file_path): + with open(temp_file_path, 'rb') as f: + partial_length = f.seek(0, io.SEEK_END) + progress.update(partial_length) + + # Check if download is complete + if partial_length >= file_size: + break + # closed range[], from 0. + get_headers['Range'] = 'bytes=%s-%s' % (partial_length, + file_size - 1) + with open(temp_file_path, 'ab+') as f: + r = requests.get( + url, + stream=True, + headers=get_headers, + cookies=cookies, + timeout=API_FILE_DOWNLOAD_TIMEOUT) + r.raise_for_status() + for chunk in r.iter_content( + chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + f.write(chunk) + break + except (Exception) as e: # no matter what happen, we will retry. + retry = retry.increment('GET', url, error=e) + retry.sleep() + + logger.debug('storing %s in cache at %s', url, local_dir) def http_get_file( @@ -319,7 +564,7 @@ def http_get_file( unit_divisor=1024, total=total, initial=downloaded_size, - desc='Downloading', + desc='Downloading [' + file_name + ']', ) for chunk in r.iter_content( chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): @@ -342,3 +587,31 @@ def http_get_file( logger.error(msg) raise FileDownloadError(msg) os.replace(temp_file.name, os.path.join(local_dir, file_name)) + + +def download_file(url, file_meta, temporary_cache_dir, cache, headers, + cookies): + if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < file_meta[ + 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: # parallel download large file. 
+ parallel_download( + url, + temporary_cache_dir, + file_meta['Path'], + headers=headers, + cookies=None if cookies is None else cookies.get_dict(), + file_size=file_meta['Size']) + else: + http_get_model_file( + url, + temporary_cache_dir, + file_meta['Path'], + file_size=file_meta['Size'], + headers=headers, + cookies=cookies) + + # check file integrity + temp_file = os.path.join(temporary_cache_dir, file_meta['Path']) + if FILE_HASH in file_meta: + file_integrity_validation(temp_file, file_meta[FILE_HASH]) + # put file into to cache + return cache.put_file(file_meta, temp_file) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index b0fae148b..144d9d695 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -45,8 +45,9 @@ def _run_git_command(self, *args) -> subprocess.CompletedProcess: logger.debug(' '.join(args)) git_env = os.environ.copy() git_env['GIT_TERMINAL_PROMPT'] = '0' + command = [self.git_path, *args] response = subprocess.run( - [self.git_path, *args], + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=git_env, @@ -55,10 +56,18 @@ def _run_git_command(self, *args) -> subprocess.CompletedProcess: response.check_returncode() return response except subprocess.CalledProcessError as error: - logger.error('There are error run git command.') - raise GitError( - 'stdout: %s, stderr: %s' % - (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + std_out = response.stdout.decode('utf8') + std_err = error.stderr.decode('utf8') + if 'nothing to commit' in std_out: + logger.info( + 'Nothing to commit, your local repo is upto date with remote' + ) + return response + else: + logger.error( + 'Running git command: %s failed \n stdout: %s \n stderr: %s' + % (command, std_out, std_err)) + raise GitError(std_err) def config_auth_token(self, repo_dir, auth_token): url = self.get_repo_remote_url(repo_dir) diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 82313b15c..015cadbd3 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -1,33 +1,46 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
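Illustrative usage only, not part of the diff: a minimal sketch of the download entry points reworked in the file_download.py hunks above and in the snapshot_download.py hunks that follow. Repo ids, file names and patterns are hypothetical placeholders.

from modelscope.hub.file_download import dataset_file_download, model_file_download
from modelscope.hub.snapshot_download import dataset_snapshot_download, snapshot_download

# A single model file, written under a caller-chosen directory instead of the shared cache.
cfg_path = model_file_download(
    'some_org/some_model', 'configuration.json', local_dir='./some_model')

# A single raw file from a dataset repository.
csv_path = dataset_file_download('some_org/some_dataset', 'train.csv')

# Whole repositories, filtered with the new fnmatch-style allow/ignore patterns.
model_dir = snapshot_download(
    'some_org/some_model', allow_patterns=['*.json', '*.safetensors'])
data_dir = dataset_snapshot_download(
    'some_org/some_dataset', ignore_patterns=['*.zip'])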
+import fnmatch import os import re -import tempfile +import uuid +from concurrent.futures import ThreadPoolExecutor from http.cookiejar import CookieJar from pathlib import Path from typing import Dict, List, Optional, Union +from tqdm.auto import tqdm + from modelscope.hub.api import HubApi, ModelScopeConfig -from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.hub.errors import InvalidParameter +from modelscope.hub.utils.caching import ModelFileSystemCache +from modelscope.hub.utils.utils import (get_model_masked_directory, + model_id_to_group_owner_name) +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION, + REPO_TYPE_DATASET, REPO_TYPE_MODEL, + REPO_TYPE_SUPPORT) from modelscope.utils.logger import get_logger -from .constants import (FILE_HASH, MODELSCOPE_DOWNLOAD_PARALLELS, - MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB) -from .file_download import (get_file_download_url, http_get_file, - parallel_download) -from .utils.caching import ModelFileSystemCache -from .utils.utils import (file_integrity_validation, get_cache_dir, - model_id_to_group_owner_name) +from .file_download import (create_temporary_directory_and_cache, + download_file, get_file_download_url) logger = get_logger() -def snapshot_download(model_id: str, - revision: Optional[str] = DEFAULT_MODEL_REVISION, - cache_dir: Union[str, Path, None] = None, - user_agent: Optional[Union[Dict, str]] = None, - local_files_only: Optional[bool] = False, - cookies: Optional[CookieJar] = None, - ignore_file_pattern: List = None) -> str: +def snapshot_download( + model_id: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Union[str, Path, None] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + local_dir: Optional[str] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +) -> str: """Download all files of a repo. Downloads a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from a repo, because you don't know which @@ -41,13 +54,101 @@ def snapshot_download(model_id: str, model_id (str): A user or an organization name and a repo name separated by a `/`. revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a commit hash. NOTE: currently only branch and tag name is supported - cache_dir (str, Path, optional): Path to the folder where cached files are stored. + cache_dir (str, Path, optional): Path to the folder where cached files are stored, model will + be save as cache_dir/model_id/THE_MODEL_FILES. + user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + cookies (CookieJar, optional): The cookie of the request, default None. + ignore_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be ignored in downloading, like exact file names or file extensions. + allow_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be downloading, like exact file names or file extensions. 
+ local_dir (str, optional): Specific local directory path to which the file will be downloaded. + allow_patterns (`str` or `List`, *optional*, default to `None`): + If provided, only files matching at least one pattern are downloaded, priority over allow_file_pattern. + For hugging-face compatibility. + ignore_patterns (`str` or `List`, *optional*, default to `None`): + If provided, files matching any of the patterns are not downloaded, priority over ignore_file_pattern. + For hugging-face compatibility. + max_workers (`int`): The maximum number of workers to download files, default 8. + Raises: + ValueError: the value details. + + Returns: + str: Local folder path (string) of repo snapshot + + Note: + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + """ + return _snapshot_download( + model_id, + repo_type=REPO_TYPE_MODEL, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + local_dir=local_dir, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + + +def dataset_snapshot_download( + dataset_id: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + cache_dir: Union[str, Path, None] = None, + local_dir: Optional[str] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +) -> str: + """Download raw files of a dataset. + Downloads all files at the specified revision. This + is useful when you want all files from a dataset, because you don't know which + ones you will need a priori. All files are nested inside a folder in order + to keep their actual filename relative to that folder. + + An alternative would be to just clone a dataset but this would require that the + user always has git and git-lfs installed, and properly configured. + + Args: + dataset_id (str): A user or an organization name and a dataset name separated by a `/`. + revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (str, Path, optional): Path to the folder where cached files are stored, dataset will + be save as cache_dir/dataset_id/THE_DATASET_FILES. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the local cached file if it exists. cookies (CookieJar, optional): The cookie of the request, default None. ignore_file_pattern (`str` or `List`, *optional*, default to `None`): Any file pattern to be ignored in downloading, like exact file names or file extensions. 
+ Use regression is deprecated. + allow_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be downloading, like exact file names or file extensions. + allow_patterns (`str` or `List`, *optional*, default to `None`): + If provided, only files matching at least one pattern are downloaded, priority over allow_file_pattern. + For hugging-face compatibility. + ignore_patterns (`str` or `List`, *optional*, default to `None`): + If provided, files matching any of the patterns are not downloaded, priority over ignore_file_pattern. + For hugging-face compatibility. + max_workers (`int`): The maximum number of workers to download files, default 8. Raises: ValueError: the value details. @@ -63,22 +164,54 @@ def snapshot_download(model_id: str, - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) if some parameter value is invalid """ + return _snapshot_download( + dataset_id, + repo_type=REPO_TYPE_DATASET, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + local_dir=local_dir, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) - if cache_dir is None: - cache_dir = get_cache_dir() - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - temporary_cache_dir = os.path.join(cache_dir, 'temp') - os.makedirs(temporary_cache_dir, exist_ok=True) - group_or_owner, name = model_id_to_group_owner_name(model_id) +def _snapshot_download( + repo_id: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Union[str, Path, None] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + local_dir: Optional[str] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +): + if not repo_type: + repo_type = REPO_TYPE_MODEL + if repo_type not in REPO_TYPE_SUPPORT: + raise InvalidParameter('Invalid repo type: %s, only support: %s' % + (repo_type, REPO_TYPE_SUPPORT)) - cache = ModelFileSystemCache(cache_dir, group_or_owner, name) + temporary_cache_dir, cache = create_temporary_directory_and_cache( + repo_id, local_dir=local_dir, cache_dir=cache_dir, repo_type=repo_type) + system_cache = cache_dir if cache_dir is not None else os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) if local_files_only: if len(cache.cached_files) == 0: raise ValueError( 'Cannot find the requested files in the cached path and outgoing' - ' traffic has been disabled. To enable model look-ups and downloads' + ' traffic has been disabled. To enable look-ups and downloads' " online, set 'local_files_only' to False.") logger.warning('We can not confirm the cached file is for revision: %s' % revision) @@ -88,76 +221,281 @@ def snapshot_download(model_id: str, # make headers headers = { 'user-agent': - ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + ModelScopeConfig.get_user_agent(user_agent=user_agent, ), } + if 'CI_TEST' not in os.environ: + # To count the download statistics, to add the snapshot-identifier as a header. 
+ headers['snapshot-identifier'] = str(uuid.uuid4()) _api = HubApi() if cookies is None: cookies = ModelScopeConfig.get_cookies() - revision = _api.get_valid_revision( - model_id, revision=revision, cookies=cookies) + repo_files = [] + if repo_type == REPO_TYPE_MODEL: + directory = os.path.abspath( + local_dir) if local_dir is not None else os.path.join( + system_cache, 'hub', repo_id) + print(f'Downloading Model to directory: {directory}') + revision_detail = _api.get_valid_revision_detail( + repo_id, revision=revision, cookies=cookies) + revision = revision_detail['Revision'] - snapshot_header = headers if 'CI_TEST' in os.environ else { - **headers, - **{ - 'Snapshot': 'True' + snapshot_header = headers if 'CI_TEST' in os.environ else { + **headers, + **{ + 'Snapshot': 'True' + } } - } - model_files = _api.get_model_files( - model_id=model_id, + if cache.cached_model_revision is not None: + snapshot_header[ + 'cached_model_revision'] = cache.cached_model_revision + + repo_files = _api.get_model_files( + model_id=repo_id, + revision=revision, + recursive=True, + use_cookies=False if cookies is None else cookies, + headers=snapshot_header, + ) + _download_file_lists( + repo_files, + cache, + temporary_cache_dir, + repo_id, + _api, + None, + None, + headers, + repo_type=repo_type, + revision=revision, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + if '.' in repo_id: + masked_directory = get_model_masked_directory( + directory, repo_id) + if os.path.exists(directory): + logger.info( + 'Target directory already exists, skipping creation.') + else: + logger.info(f'Creating symbolic link [{directory}].') + try: + os.symlink( + os.path.abspath(masked_directory), directory) + except OSError: + logger.warning( + f'Failed to create symbolic link {directory}.') + + elif repo_type == REPO_TYPE_DATASET: + directory = os.path.abspath( + local_dir) if local_dir else os.path.join( + system_cache, 'datasets', repo_id) + print(f'Downloading Dataset to directory: {directory}') + + group_or_owner, name = model_id_to_group_owner_name(repo_id) + revision_detail = revision or DEFAULT_DATASET_REVISION + + logger.info('Fetching dataset repo file list...') + repo_files = fetch_repo_files(_api, name, group_or_owner, + revision_detail) + + if repo_files is None: + logger.error( + f'Failed to retrieve file list for dataset: {repo_id}') + return None + + _download_file_lists( + repo_files, + cache, + temporary_cache_dir, + repo_id, + _api, + name, + group_or_owner, + headers, + repo_type=repo_type, + revision=revision, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + + cache.save_model_version(revision_info=revision_detail) + cache_root_path = cache.get_root_location() + return cache_root_path + + +def fetch_repo_files(_api, name, group_or_owner, revision): + page_number = 1 + page_size = 150 + repo_files = [] + + while True: + files_list_tree = _api.list_repo_tree( + dataset_name=name, + namespace=group_or_owner, revision=revision, + root_path='/', recursive=True, - use_cookies=False if cookies is None else cookies, - headers=snapshot_header, - ) - - if ignore_file_pattern is None: - ignore_file_pattern = [] - if isinstance(ignore_file_pattern, str): - ignore_file_pattern = [ignore_file_pattern] - - with 
tempfile.TemporaryDirectory( - dir=temporary_cache_dir) as temp_cache_dir: - for model_file in model_files: - if model_file['Type'] == 'tree' or \ - any([re.search(pattern, model_file['Name']) is not None for pattern in ignore_file_pattern]): + page_number=page_number, + page_size=page_size) + + if not ('Code' in files_list_tree and files_list_tree['Code'] == 200): + logger.error(f'Get dataset file list failed, request_id: \ + {files_list_tree["RequestId"]}, message: {files_list_tree["Message"]}' + ) + return None + + cur_repo_files = files_list_tree['Data']['Files'] + repo_files.extend(cur_repo_files) + + if len(cur_repo_files) < page_size: + break + + page_number += 1 + + return repo_files + + +def _is_valid_regex(pattern: str): + try: + re.compile(pattern) + return True + except BaseException: + return False + + +def _normalize_patterns(patterns: Union[str, List[str]]): + if isinstance(patterns, str): + patterns = [patterns] + if patterns is not None: + patterns = [ + item if not item.endswith('/') else item + '*' for item in patterns + ] + return patterns + + +def _get_valid_regex_pattern(patterns: List[str]): + if patterns is not None: + regex_patterns = [] + for item in patterns: + if _is_valid_regex(item): + regex_patterns.append(item) + return regex_patterns + else: + return None + + +def thread_download(func, iterable, max_workers, **kwargs): + # Create a tqdm progress bar with the total number of files to fetch + with tqdm( + total=len(iterable), + desc=f'Fetching {len(iterable)} files') as pbar: + # Define a wrapper function to update the progress bar + def progress_wrapper(*args, **kwargs): + result = func(*args, **kwargs) + pbar.update(1) + return result + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + executor.map(progress_wrapper, iterable) + + +def _download_file_lists( + repo_files: List[str], + cache: ModelFileSystemCache, + temporary_cache_dir: str, + repo_id: str, + api: HubApi, + name: str, + group_or_owner: str, + headers, + repo_type: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8): + ignore_patterns = _normalize_patterns(ignore_patterns) + allow_patterns = _normalize_patterns(allow_patterns) + ignore_file_pattern = _normalize_patterns(ignore_file_pattern) + allow_file_pattern = _normalize_patterns(allow_file_pattern) + # to compatible regex usage. 
+ ignore_regex_pattern = _get_valid_regex_pattern(ignore_file_pattern) + + filtered_repo_files = [] + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + try: + # processing patterns + if ignore_patterns and any([ + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in ignore_patterns + ]): + continue + + if ignore_file_pattern and any([ + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in ignore_file_pattern + ]): + continue + + if ignore_regex_pattern and any([ + re.search(pattern, repo_file['Name']) is not None + for pattern in ignore_regex_pattern + ]): # noqa E501 + continue + + if allow_patterns is not None and allow_patterns: + if not any( + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in allow_patterns): continue - # check model_file is exist in cache, if existed, skip download, otherwise download - if cache.exists(model_file): - file_name = os.path.basename(model_file['Name']) - logger.debug( - f'File {file_name} already in cache, skip downloading!' - ) + + if allow_file_pattern is not None and allow_file_pattern: + if not any( + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in allow_file_pattern): continue + # check model_file is exist in cache, if existed, skip download + if cache.exists(repo_file): + file_name = os.path.basename(repo_file['Name']) + logger.debug( + f'File {file_name} already in cache with identical hash, skip downloading!' + ) + continue + except Exception as e: + logger.warning('The file pattern is invalid : %s' % e) + else: + filtered_repo_files.append(repo_file) - # get download url - url = get_file_download_url( - model_id=model_id, - file_path=model_file['Path'], - revision=revision) - - if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < model_file[ - 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: - parallel_download( - url, - temp_cache_dir, - model_file['Name'], - headers=headers, - cookies=None - if cookies is None else cookies.get_dict(), - file_size=model_file['Size']) - else: - http_get_file( - url, - temp_cache_dir, - model_file['Name'], - headers=headers, - cookies=cookies) - - # check file integrity - temp_file = os.path.join(temp_cache_dir, model_file['Name']) - if FILE_HASH in model_file: - file_integrity_validation(temp_file, model_file[FILE_HASH]) - # put file to cache - cache.put_file(model_file, temp_file) - - return os.path.join(cache.get_root_location()) + def _download_single_file(repo_file): + if repo_type == REPO_TYPE_MODEL: + url = get_file_download_url( + model_id=repo_id, + file_path=repo_file['Path'], + revision=revision) + elif repo_type == REPO_TYPE_DATASET: + url = api.get_dataset_file_url( + file_name=repo_file['Path'], + dataset_name=name, + namespace=group_or_owner, + revision=revision) + else: + raise InvalidParameter( + f'Invalid repo type: {repo_type}, supported types: {REPO_TYPE_SUPPORT}' + ) + download_file(url, repo_file, temporary_cache_dir, cache, headers, + cookies) + + if len(filtered_repo_files) > 0: + thread_download(_download_single_file, filtered_repo_files, + max_workers) + logger.info(f"Download {repo_type} '{repo_id}' successfully.") diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index f92aaaf46..e1dcf83bc 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -5,17 +5,26 @@ import pickle import tempfile from shutil import move, rmtree +from typing import Dict -from modelscope.hub.constants import MODEL_META_FILE_NAME, MODEL_META_MODEL_ID +from modelscope.hub.constants 
import ( # noqa + FILE_HASH, MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION) +from modelscope.hub.utils.utils import compute_hash from modelscope.utils.logger import get_logger logger = get_logger() + +enable_default_hash_validation = \ + os.getenv(MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION, 'False').strip().lower() == 'true' """Implements caching functionality, used internally only """ class FileSystemCache(object): KEY_FILE_NAME = '.msc' + MODEL_META_FILE_NAME = '.mdl' + MODEL_META_MODEL_ID = 'id' + MODEL_VERSION_FILE_NAME = '.mv' """Local file cache. """ @@ -133,24 +142,47 @@ def __init__(self, cache_root, owner=None, name=None): self.load_model_meta() else: super().__init__(os.path.join(cache_root, owner, name)) - self.model_meta = {MODEL_META_MODEL_ID: '%s/%s' % (owner, name)} + self.model_meta = { + FileSystemCache.MODEL_META_MODEL_ID: '%s/%s' % (owner, name) + } self.save_model_meta() + self.cached_model_revision = self.load_model_version() def load_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) if os.path.exists(meta_file_path): with open(meta_file_path, 'rb') as f: self.model_meta = pickle.load(f) else: - self.model_meta = {MODEL_META_MODEL_ID: 'unknown'} + self.model_meta = {FileSystemCache.MODEL_META_MODEL_ID: 'unknown'} + + def load_model_version(self): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + if os.path.exists(model_version_file_path): + with open(model_version_file_path, 'r') as f: + return f.read().strip() + else: + return None + + def save_model_version(self, revision_info: Dict): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + with open(model_version_file_path, 'w') as f: + if isinstance(revision_info, dict): + version_info_str = 'Revision:%s,CreatedAt:%s' % ( + revision_info['Revision'], revision_info['CreatedAt']) + f.write(version_info_str) + else: + f.write(revision_info) def get_model_id(self): - return self.model_meta[MODEL_META_MODEL_ID] + return self.model_meta[FileSystemCache.MODEL_META_MODEL_ID] def save_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) with open(meta_file_path, 'wb') as f: pickle.dump(self.model_meta, f) @@ -226,26 +258,40 @@ def __get_cache_key(self, model_file_info): return cache_key def exists(self, model_file_info): - """Check the file is cached or not. + """Check the file is cached or not. 
Note existence check will also cover digest check Args: model_file_info (CachedFileInfo): The cached file info Returns: - bool: If exists return True otherwise False + bool: If exists and has the same hash, return True otherwise False """ key = self.__get_cache_key(model_file_info) is_exists = False + file_path = key['Path'] + cache_file_path = os.path.join(self.cache_root_location, + model_file_info['Path']) for cached_key in self.cached_files: - if cached_key['Path'] == key['Path'] and ( + if cached_key['Path'] == file_path and ( cached_key['Revision'].startswith(key['Revision']) or key['Revision'].startswith(cached_key['Revision'])): - is_exists = True - break - file_path = os.path.join(self.cache_root_location, - model_file_info['Path']) + expected_hash = model_file_info[FILE_HASH] + if expected_hash is not None and os.path.exists( + cache_file_path): + # compute hash only when enabled, otherwise just meet expectation by default + if enable_default_hash_validation: + cache_file_sha256 = compute_hash(cache_file_path) + else: + cache_file_sha256 = expected_hash + if expected_hash == cache_file_sha256: + is_exists = True + break + else: + logger.info( + f'File [{file_path}] exists in cache but with a mismatched hash, will re-download.' + ) if is_exists: - if os.path.exists(file_path): + if os.path.exists(cache_file_path): return True else: self.remove_key( diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 31e6e72c0..bb38f26ac 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -3,6 +3,7 @@ import hashlib import os from datetime import datetime +from pathlib import Path from typing import Optional import requests @@ -12,7 +13,7 @@ MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, MODELSCOPE_URL_SCHEME) from modelscope.hub.errors import FileIntegrityError -from modelscope.utils.file_utils import get_default_cache_dir +from modelscope.utils.file_utils import get_default_modelscope_cache_dir from modelscope.utils.logger import get_logger logger = get_logger() @@ -28,17 +29,48 @@ def model_id_to_group_owner_name(model_id): return group_or_owner, name +# during model download, the '.' would be converted to '___' to produce +# actual physical (masked) directory for storage +def get_model_masked_directory(directory, model_id): + parts = directory.rsplit('/', 2) + # this is the actual directory the model files are located. + masked_directory = os.path.join(parts[0], model_id.replace('.', '___')) + return masked_directory + + +def convert_readable_size(size_bytes): + import math + if size_bytes == 0: + return '0B' + size_name = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return f'{s} {size_name[i]}' + + +def get_folder_size(folder_path): + total_size = 0 + for path in Path(folder_path).rglob('*'): + if path.is_file(): + total_size += path.stat().st_size + return total_size + + +# return a readable string that describe size of for a given folder (MB, GB etc.) +def get_readable_folder_size(folder_path) -> str: + return convert_readable_size(get_folder_size(folder_path=folder_path)) + + def get_cache_dir(model_id: Optional[str] = None): """cache dir precedence: function parameter > environment > ~/.cache/modelscope/hub - Args: model_id (str, optional): The model id. - Returns: str: the model_id dir if model_id not None, otherwise cache root dir. 
""" - default_cache_dir = get_default_cache_dir() + default_cache_dir = Path.home().joinpath('.cache', 'modelscope') base_path = os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir, 'hub')) return base_path if model_id is None else os.path.join( @@ -89,6 +121,7 @@ def file_integrity_validation(file_path, expected_sha256): file_sha256 = compute_hash(file_path) if not file_sha256 == expected_sha256: os.remove(file_path) - msg = 'File %s integrity check failed, the download may be incomplete, please try again.' % file_path + msg = 'File %s integrity check failed, expected sha256 signature is %s, actual is %s, the download may be incomplete, please try again.' % ( # noqa E501 + file_path, expected_sha256, file_sha256) logger.error(msg) raise FileIntegrityError(msg) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 87d5f3129..8166e004c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -39,6 +39,7 @@ class Models(object): body_3d_keypoints_hdformer = 'hdformer' crowd_counting = 'HRNetCrowdCounting' face_2d_keypoints = 'face-2d-keypoints' + star_68ldk_detection = 'star-68ldk-detection' panoptic_segmentation = 'swinL-panoptic-segmentation' r50_panoptic_segmentation = 'r50-panoptic-segmentation' image_reid_person = 'passvitb' @@ -52,10 +53,13 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' newcrfs_depth_estimation = 'newcrfs-depth-estimation' + omnidata_normal_estimation = 'omnidata-normal-estimation' panovit_layout_estimation = 'panovit-layout-estimation' unifuse_depth_estimation = 'unifuse-depth-estimation' s2net_depth_estimation = 's2net-depth-estimation' dro_resnet18_depth_estimation = 'dro-resnet18-depth-estimation' + raft_dense_optical_flow_estimation = 'raft-dense-optical-flow-estimation' + human_normal_estimation = 'human-normal-estimation' resnet50_bert = 'resnet50-bert' referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' @@ -87,12 +91,15 @@ class Models(object): video_object_segmentation = 'video-object-segmentation' video_deinterlace = 'video-deinterlace' quadtree_attention_image_matching = 'quadtree-attention-image-matching' + loftr_image_local_feature_matching = 'loftr-image-local-feature-matching' + lightglue_image_matching = 'lightglue-image-matching' vision_middleware = 'vision-middleware' vidt = 'vidt' video_stabilization = 'video-stabilization' real_basicvsr = 'real-basicvsr' rcp_sceneflow_estimation = 'rcp-sceneflow-estimation' image_casmvs_depth_estimation = 'image-casmvs-depth-estimation' + image_geomvsnet_depth_estimation = 'image-geomvsnet-depth-estimation' vop_retrieval_model = 'vop-retrieval-model' vop_retrieval_model_se = 'vop-retrieval-model-se' ddcolor = 'ddcolor' @@ -127,6 +134,9 @@ class Models(object): human_image_generation = 'human-image-generation' image_view_transform = 'image-view-transform' image_control_3d_portrait = 'image-control-3d-portrait' + rife = 'rife' + anydoor = 'anydoor' + self_supervised_depth_completion = 'self-supervised-depth-completion' # nlp models bert = 'bert' @@ -183,6 +193,7 @@ class Models(object): # audio models sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base' speech_dfsmn_ans = 'speech_dfsmn_ans' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_dfsmn_kws_char_farfield_iot = 
'speech_dfsmn_kws_char_farfield_iot' @@ -195,9 +206,13 @@ class Models(object): generic_itn = 'generic-itn' generic_punc = 'generic-punc' generic_sv = 'generic-sv' + tdnn_sv = 'tdnn-sv' ecapa_tdnn_sv = 'ecapa-tdnn-sv' campplus_sv = 'cam++-sv' eres2net_sv = 'eres2net-sv' + eres2netv2_sv = 'eres2netv2-sv' + resnet_sv = 'resnet-sv' + res2net_sv = 'res2net-sv' eres2net_aug_sv = 'eres2net-aug-sv' scl_sd = 'scl-sd' scl_sd_xvector = 'scl-sd-xvector' @@ -205,7 +220,11 @@ class Models(object): eres2net_lre = 'eres2net-lre' cluster_backend = 'cluster-backend' rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv' + sdpn_sv = 'sdpn_ecapa-sv' generic_lm = 'generic-lm' + audio_quantization = 'audio-quantization' + laura_codec = 'laura-codec' + funasr = 'funasr' # multi-modal models ofa = 'ofa' @@ -326,6 +345,7 @@ class Pipelines(object): tinymog_face_detection = 'manual-face-detection-tinymog' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' facial_landmark_confidence = 'manual-facial-landmark-confidence-flcm' + facial_68ldk_detection = 'facial-68ldk-detection' face_attribute_recognition = 'resnet34-face-attribute-recognition-fairface' retina_face_detection = 'resnet50-face-detection-retinaface' mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' @@ -386,10 +406,13 @@ class Pipelines(object): language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_depth_estimation = 'image-depth-estimation' + image_normal_estimation = 'image-normal-estimation' indoor_layout_estimation = 'indoor-layout-estimation' + image_local_feature_matching = 'image-local-feature-matching' video_depth_estimation = 'video-depth-estimation' panorama_depth_estimation = 'panorama-depth-estimation' panorama_depth_estimation_s2net = 'panorama-depth-estimation-s2net' + dense_optical_flow_estimation = 'dense-optical-flow-estimation' image_reid_person = 'passvitb-image-reid-person' image_inpainting = 'fft-inpainting' image_paintbyexample = 'stablediffusion-paintbyexample' @@ -416,6 +439,7 @@ class Pipelines(object): video_object_segmentation = 'video-object-segmentation' video_deinterlace = 'video-deinterlace' image_matching = 'image-matching' + image_matching_fast = 'image-matching-fast' video_stabilization = 'video-stabilization' video_super_resolution = 'realbasicvsr-video-super-resolution' pointcloud_sceneflow_estimation = 'pointcloud-sceneflow-estimation' @@ -447,6 +471,7 @@ class Pipelines(object): image_quality_assessment_degradation = 'image-quality-assessment-degradation' vision_efficient_tuning = 'vision-efficient-tuning' image_bts_depth_estimation = 'image-bts-depth-estimation' + image_depth_estimation_marigold = 'image-depth-estimation-marigold' pedestrian_attribute_recognition = 'resnet50_pedestrian-attribute-recognition_image' text_to_360panorama_image = 'text-to-360panorama-image' image_try_on = 'image-try-on' @@ -455,6 +480,11 @@ class Pipelines(object): human3d_animation = 'human3d-animation' image_view_transform = 'image-view-transform' image_control_3d_portrait = 'image-control-3d-portrait' + rife_video_frame_interpolation = 'rife-video-frame-interpolation' + anydoor = 'anydoor' + image_to_3d = 'image-to-3d' + self_supervised_depth_completion = 'self-supervised-depth-completion' + human_normal_estimation = 'human-normal-estimation' # nlp tasks automatic_post_editing = 'automatic-post-editing' @@ -522,21 +552,24 @@ class Pipelines(object): sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 
'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base' speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_separation = 'speech-separation' kws_kwsbp = 'kws-kwsbp' - asr_inference = 'asr-inference' asr_wenet_inference = 'asr-wenet-inference' itn_inference = 'itn-inference' - punc_inference = 'punc-inference' - sv_inference = 'sv-inference' speaker_diarization_inference = 'speaker-diarization-inference' vad_inference = 'vad-inference' funasr_speech_separation = 'funasr-speech-separation' speaker_verification = 'speaker-verification' + speaker_verification_tdnn = 'speaker-verification-tdnn' speaker_verification_rdino = 'speaker-verification-rdino' + speaker_verification_sdpn = 'speaker-verification-sdpn' speaker_verification_eres2net = 'speaker-verification-eres2net' + speaker_verification_eres2netv2 = 'speaker-verification-eres2netv2' + speaker_verification_resnet = 'speaker-verification-resnet' + speaker_verification_res2net = 'speaker-verification-res2net' speech_language_recognition = 'speech-language-recognition' speech_language_recognition_eres2net = 'speech-language-recognition-eres2net' speaker_change_locating = 'speaker-change-locating' @@ -545,6 +578,9 @@ class Pipelines(object): segmentation_clustering = 'segmentation-clustering' lm_inference = 'language-score-prediction' speech_timestamp_inference = 'speech-timestamp-inference' + audio_quantization = 'audio-quantization' + audio_quantization_inference = 'audio-quantization-inference' + laura_codec_tts_inference = 'laura-codec-tts-inference' # multi-modal tasks image_captioning = 'image-captioning' @@ -584,6 +620,9 @@ class Pipelines(object): # science tasks protein_structure = 'unifold-protein-structure' + # funasr task + funasr_pipeline = 'funasr-pipeline' + DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) @@ -778,6 +817,12 @@ class Pipelines(object): Tasks.image_depth_estimation: (Pipelines.image_depth_estimation, 'damo/cv_newcrfs_image-depth-estimation_indoor'), + Tasks.image_normal_estimation: + (Pipelines.image_normal_estimation, + 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'), + Tasks.human_normal_estimation: + (Pipelines.human_normal_estimation, + 'Damo_XR_Lab/cv_human_monocular-normal-estimation'), Tasks.indoor_layout_estimation: (Pipelines.indoor_layout_estimation, 'damo/cv_panovit_indoor-layout-estimation'), @@ -787,6 +832,12 @@ class Pipelines(object): Tasks.panorama_depth_estimation: (Pipelines.panorama_depth_estimation, 'damo/cv_unifuse_panorama-depth-estimation'), + Tasks.dense_optical_flow_estimation: + (Pipelines.dense_optical_flow_estimation, + 'Damo_XR_Lab/cv_raft_dense-optical-flow_things'), + Tasks.image_local_feature_matching: + (Pipelines.image_local_feature_matching, + 'Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data'), Tasks.image_style_transfer: (Pipelines.image_style_transfer, 'damo/cv_aams_style-transfer_damo'), Tasks.face_image_generation: (Pipelines.face_image_generation, @@ -804,20 +855,20 @@ class Pipelines(object): Tasks.image_to_image_generation: (Pipelines.image_to_image_generation, 'damo/cv_latent_diffusion_image2image_generate'), - Tasks.image_classification: - (Pipelines.daily_image_classification, - 'damo/cv_vit-base_image-classification_Dailylife-labels'), - Tasks.image_object_detection: - 
(Pipelines.image_object_detection_auto, - 'damo/cv_yolox_image-object-detection-auto'), - Tasks.ocr_recognition: - (Pipelines.ocr_recognition, - 'damo/cv_convnextTiny_ocr-recognition-general_damo'), + Tasks.image_classification: ( + Pipelines.daily_image_classification, + 'damo/cv_vit-base_image-classification_Dailylife-labels'), + Tasks.image_object_detection: ( + Pipelines.image_object_detection_auto, + 'damo/cv_yolox_image-object-detection-auto'), + Tasks.ocr_recognition: ( + Pipelines.ocr_recognition, + 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), - Tasks.faq_question_answering: - (Pipelines.faq_question_answering, - 'damo/nlp_structbert_faq-question-answering_chinese-base'), + Tasks.faq_question_answering: ( + Pipelines.faq_question_answering, + 'damo/nlp_structbert_faq-question-answering_chinese-base'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), Tasks.video_single_object_tracking: ( @@ -936,7 +987,10 @@ class Pipelines(object): 'damo/cv_image-view-transform'), Tasks.image_control_3d_portrait: ( Pipelines.image_control_3d_portrait, - 'damo/cv_vit_image-control-3d-portrait-synthesis') + 'damo/cv_vit_image-control-3d-portrait-synthesis'), + Tasks.self_supervised_depth_completion: ( + Pipelines.self_supervised_depth_completion, + 'damo/self-supervised-depth-completion') } @@ -959,6 +1013,7 @@ class CVTrainers(object): nerf_recon_4k = 'nerf-recon-4k' action_detection = 'action-detection' vision_efficient_tuning = 'vision-efficient-tuning' + self_supervised_depth_completion = 'self-supervised-depth-completion' class NLPTrainers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index 75ccfcf96..e95a22fe1 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -69,7 +69,9 @@ 'loss_metric': ['LossMetric'], 'image_colorization_metric': ['ImageColorizationMetric'], 'ocr_recognition_metric': ['OCRRecognitionMetric'], - 'translation_evaluation_metric': ['TranslationEvaluationMetric'] + 'translation_evaluation_metric': ['TranslationEvaluationMetric'], + 'video_super_resolution_metric.video_super_resolution_metric': + ['VideoSuperResolutionMetric'], } import sys diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index 24d86dfd5..e7e08ede9 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -1,9 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import TYPE_CHECKING +from modelscope.utils.automodel_utils import fix_transformers_upgrade from modelscope.utils.error import (AUDIO_IMPORT_ERROR, TENSORFLOW_IMPORT_WARNING) -from modelscope.utils.import_utils import is_torch_available +from modelscope.utils.import_utils import (is_torch_available, + is_transformers_available) from . import audio, cv, multi_modal, nlp from .base import Head, Model from .builder import BACKBONES, HEADS, MODELS, build_model @@ -11,3 +13,6 @@ if is_torch_available(): from .base.base_torch_model import TorchModel from .base.base_torch_head import TorchHead + +if is_transformers_available(): + fix_transformers_upgrade() diff --git a/modelscope/models/audio/ans/zipenhancer.py b/modelscope/models/audio/ans/zipenhancer.py new file mode 100644 index 000000000..544d9dc74 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. and its affiliates. 
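# --- Editor's note (illustrative sketch, not part of the patch) --------------
# The metainfo additions above register new keys such as
# Models.speech_zipenhancer_ans_multiloss_16k_base and the matching
# Pipelines entry. A minimal sketch of how such a registration is usually
# exercised through the ModelScope pipeline factory is shown below; the model
# id string and the output file names are placeholders, and the repo id
# actually published on ModelScope may differ.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Resolve the acoustic-noise-suppression task to the newly registered model.
ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='damo/speech_zipenhancer_ans_multiloss_16k_base')  # hypothetical id

# Denoise a local 16 kHz wav; output_path mirrors how the existing ANS
# pipelines are commonly invoked and is where the enhanced wav is written.
result = ans('noisy_speech_16k.wav', output_path='enhanced_speech_16k.wav')
# ------------------------------------------------------------------------------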
+ +import os +import random +from typing import Dict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .zipenhancer_layers.generator import (DenseEncoder, MappingDecoder, + PhaseDecoder) +from .zipenhancer_layers.scaling import ScheduledFloat +from .zipenhancer_layers.zipenhancer_layer import Zipformer2DualPathEncoder + + +@MODELS.register_module( + Tasks.acoustic_noise_suppression, + module_name=Models.speech_zipenhancer_ans_multiloss_16k_base) +class ZipenhancerDecorator(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + h = dict( + num_tsconformers=kwargs['num_tsconformers'], + dense_channel=kwargs['dense_channel'], + former_conf=kwargs['former_conf'], + batch_first=kwargs['batch_first'], + model_num_spks=kwargs['model_num_spks'], + ) + # num_tsconformers, dense_channel, former_name, former_conf, batch_first, model_num_spks + + h = AttrDict(h) + self.model = ZipEnhancer(h) + model_bin_file = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + if os.path.exists(model_bin_file): + checkpoint = torch.load( + model_bin_file, map_location=torch.device('cpu')) + if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: + # the new trained model by user is based on ZipenhancerDecorator + self.load_state_dict(checkpoint['state_dict']) + else: + # The released model on Modelscope is based on Zipenhancer + # self.model.load_state_dict(checkpoint, strict=False) + self.model.load_state_dict(checkpoint['generator']) + # print(checkpoint['generator'].keys()) + + def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + n_fft = 400 + hop_size = 100 + win_size = 400 + noisy_wav = inputs['noisy'] + norm_factor = torch.sqrt(noisy_wav.shape[1] + / torch.sum(noisy_wav**2.0)) + noisy_audio = (noisy_wav * norm_factor) + + mag, pha, com = mag_pha_stft( + noisy_audio, + n_fft, + hop_size, + win_size, + compress_factor=0.3, + center=True) + amp_g, pha_g, com_g, _, others = self.model.forward(mag, pha) + wav = mag_pha_istft( + amp_g, + pha_g, + n_fft, + hop_size, + win_size, + compress_factor=0.3, + center=True) + + wav = wav / norm_factor + + output = { + 'wav_l2': wav, + } + + return output + + +class ZipEnhancer(nn.Module): + + def __init__(self, h): + """ + Initialize the ZipEnhancer module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + having num_tsconformers, former_name, former_conf, mask_decoder_type, ... + """ + super(ZipEnhancer, self).__init__() + self.h = h + + num_tsconformers = h.num_tsconformers + self.num_tscblocks = num_tsconformers + self.dense_encoder = DenseEncoder(h, in_channel=2) + + self.TSConformer = Zipformer2DualPathEncoder( + output_downsampling_factor=1, + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + **h.former_conf) + + self.mask_decoder = MappingDecoder(h, out_channel=h.model_num_spks) + self.phase_decoder = PhaseDecoder(h, out_channel=h.model_num_spks) + + def forward(self, noisy_mag, noisy_pha): # [B, F, T] + """ + Forward pass of the ZipEnhancer module. + + Args: + noisy_mag (Tensor): Noisy magnitude input tensor of shape [B, F, T]. + noisy_pha (Tensor): Noisy phase input tensor of shape [B, F, T]. 
+ + Returns: + Tuple: denoised magnitude, denoised phase, denoised complex representation, + (optional) predicted noise components, and other auxiliary information. + """ + others = dict() + + noisy_mag = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F] + noisy_pha = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F] + x = torch.cat((noisy_mag, noisy_pha), dim=1) # [B, 2, T, F] + x = self.dense_encoder(x) + + # [B, C, T, F] + x = self.TSConformer(x) + + pred_mag = self.mask_decoder(x) + pred_pha = self.phase_decoder(x) + # b, c, t, f -> b, 1, t, f -> b, f, t, 1 -> b, f, t + denoised_mag = pred_mag[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, + 1).squeeze(-1) + + # b, t, f + denoised_pha = pred_pha[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, + 1).squeeze(-1) + # b, t, f + denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha), + denoised_mag * torch.sin(denoised_pha)), + dim=-1) + + return denoised_mag, denoised_pha, denoised_com, None, others + + +class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def mag_pha_stft(y, + n_fft, + hop_size, + win_size, + compress_factor=1.0, + center=True): + hann_window = torch.hann_window(win_size, device=y.device) + stft_spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode='reflect', + normalized=False, + return_complex=True) + stft_spec = torch.view_as_real(stft_spec) + mag = torch.sqrt(stft_spec.pow(2).sum(-1) + (1e-9)) + pha = torch.atan2(stft_spec[:, :, :, 1], stft_spec[:, :, :, 0] + (1e-5)) + # Magnitude Compression + mag = torch.pow(mag, compress_factor) + com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1) + + return mag, pha, com + + +def mag_pha_istft(mag, + pha, + n_fft, + hop_size, + win_size, + compress_factor=1.0, + center=True): + # Magnitude Decompression + mag = torch.pow(mag, (1.0 / compress_factor)) + com = torch.complex(mag * torch.cos(pha), mag * torch.sin(pha)) + hann_window = torch.hann_window(win_size, device=com.device) + + wav = torch.istft( + com, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center) + return wav diff --git a/modelscope/models/audio/punc/__init__.py b/modelscope/models/audio/ans/zipenhancer_layers/__init__.py similarity index 100% rename from modelscope/models/audio/punc/__init__.py rename to modelscope/models/audio/ans/zipenhancer_layers/__init__.py diff --git a/modelscope/models/audio/ans/zipenhancer_layers/generator.py b/modelscope/models/audio/ans/zipenhancer_layers/generator.py new file mode 100644 index 000000000..8332ba4d8 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/generator.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. and its affiliates. 
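# --- Editor's note (illustrative sketch, not part of the patch) --------------
# A small sketch of the compressed STFT round trip that ZipenhancerDecorator
# performs around the network: RMS-normalise the waveform, analyse it with
# mag_pha_stft (compress_factor=0.3), then resynthesise with mag_pha_istft.
# It assumes the zipenhancer module added above is importable from a source
# tree that contains this patch; the input waveform here is random data.
import torch
from modelscope.models.audio.ans.zipenhancer import mag_pha_stft, mag_pha_istft

n_fft, hop_size, win_size = 400, 100, 400   # values hard-coded in forward()
wav = torch.randn(1, 16000)                 # one second of fake 16 kHz audio

# Same RMS normalisation as ZipenhancerDecorator.forward.
norm_factor = torch.sqrt(wav.shape[1] / torch.sum(wav ** 2.0))
mag, pha, com = mag_pha_stft(
    wav * norm_factor, n_fft, hop_size, win_size, compress_factor=0.3)
print(mag.shape, pha.shape)                 # (B, F, T) = [1, 201, 161] each

# The model consumes (mag, pha) and predicts denoised (amp_g, pha_g); here we
# simply invert the analysis to show the reconstruction path.
recon = mag_pha_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=0.3)
recon = recon / norm_factor                 # undo the RMS normalisation
print(recon.shape)                          # [1, 16000]
# ------------------------------------------------------------------------------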
+# Part of the implementation is borrowed and modified from MP-SENet, +# public available at https://github.com/yxlu-0102/MP-SENet + +import random + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SubPixelConvTranspose2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=(1, 3), + stride=(1, 2), + padding=(0, 1)): + super(SubPixelConvTranspose2d, self).__init__() + self.upscale_width_factor = stride[1] + self.conv1 = nn.Conv2d( + in_channels, + out_channels * self.upscale_width_factor, + kernel_size=kernel_size, + padding=padding) # only change the width + + def forward(self, x): + + b, c, t, f = x.size() + # Use conv1 for upsampling, followed by expansion only in the width dimension. + x = self.conv1(x) + # print(x.size()) + # Note: Here we do not directly use PixelShuffle because we only intend to expand in the width dimension, + # whereas PixelShuffle operates simultaneously on both height and width, hence we manually adjust accordingly. + # b, 2c, t, f + # print(x.size()) + x = x.view(b, c, self.upscale_width_factor, t, + f).permute(0, 1, 3, 4, 2).contiguous() + # b, c, 2, t, f -> b, c, t, f, 2 + x = x.view(b, c, t, f * self.upscale_width_factor) + # b, c, t, 2f = 202 + # x = nn.functional.pad(x, (0, 1)) + # b, c, t, 2f = 202 + + return x + + +class DenseBlockV2(nn.Module): + """ + A denseblock for ZipEnhancer + """ + + def __init__(self, h, kernel_size=(2, 3), depth=4): + super(DenseBlockV2, self).__init__() + self.h = h + self.depth = depth + self.dense_block = nn.ModuleList([]) + for i in range(depth): + dil = 2**i + pad_length = kernel_size[0] + (dil - 1) * (kernel_size[0] - 1) - 1 + dense_conv = nn.Sequential( + nn.ConstantPad2d((1, 1, pad_length, 0), value=0.), + nn.Conv2d( + h.dense_channel * (i + 1), + h.dense_channel, + kernel_size, + dilation=(dil, 1)), + # nn.Conv2d(h.dense_channel * (i + 1), h.dense_channel, kernel_size, dilation=(dil, 1), + # padding=get_padding_2d(kernel_size, (dil, 1))), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + self.dense_block.append(dense_conv) + + def forward(self, x): + skip = x + # b, c, t, f + for i in range(self.depth): + _x = skip + x = self.dense_block[i](_x) + # print(x.size()) + skip = torch.cat([x, skip], dim=1) + return x + + +class DenseEncoder(nn.Module): + + def __init__(self, h, in_channel): + """ + Initialize the DenseEncoder module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + in_channel (int): Number of input channels. Example: mag + phase: 2 channels + """ + super(DenseEncoder, self).__init__() + self.h = h + self.dense_conv_1 = nn.Sequential( + nn.Conv2d(in_channel, h.dense_channel, (1, 1)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + + self.dense_block = DenseBlockV2(h, depth=4) + + encoder_pad_kersize = (0, 1) + # Here pad was originally (0, 0),now change to (0, 1) + self.dense_conv_2 = nn.Sequential( + nn.Conv2d( + h.dense_channel, + h.dense_channel, (1, 3), (1, 2), + padding=encoder_pad_kersize), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + + def forward(self, x): + """ + Forward pass of the DenseEncoder module. + + Args: + x (Tensor): Input tensor of shape [B, C=in_channel, T, F]. + + Returns: + Tensor: Output tensor after passing through the dense encoder. Maybe: [B, C=dense_channel, T, F // 2]. 
+ """ + # print("x: {}".format(x.size())) + x = self.dense_conv_1(x) # [b, 64, T, F] + if self.dense_block is not None: + x = self.dense_block(x) # [b, 64, T, F] + x = self.dense_conv_2(x) # [b, 64, T, F//2] + return x + + +class BaseDecoder(nn.Module): + + def __init__(self, h): + """ + Initialize the BaseDecoder module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + including upsample_type, dense_block_type. + """ + super(BaseDecoder, self).__init__() + + self.upsample_module_class = SubPixelConvTranspose2d + + # for both mag and phase decoder + self.dense_block = DenseBlockV2(h, depth=4) + + +class MappingDecoder(BaseDecoder): + + def __init__(self, h, out_channel=1): + """ + Initialize the MappingDecoderV3 module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + out_channel (int): Number of output channels. Default is 1. The number of output spearkers. + """ + super(MappingDecoder, self).__init__(h) + decoder_final_kersize = (1, 2) + + self.mask_conv = nn.Sequential( + self.upsample_module_class(h.dense_channel, h.dense_channel, + (1, 3), (1, 2)), + # nn.Conv2d(h.dense_channel, out_channel, (1, 1)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel), + nn.Conv2d(h.dense_channel, out_channel, decoder_final_kersize)) + # Upsample at F dimension + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """ + Forward pass of the MappingDecoderV3 module. + + Args: + x (Tensor): Input tensor. [B, C, T, F] + + Returns: + Tensor: Output tensor after passing through the decoder. [B, Num_Spks, T, F] + """ + if self.dense_block is not None: + x = self.dense_block(x) + x = self.mask_conv(x) + x = self.relu(x) + # b, c=1, t, f + return x + + +class PhaseDecoder(BaseDecoder): + + def __init__(self, h, out_channel=1): + super(PhaseDecoder, self).__init__(h) + + # now change to (1, 2), previous (1, 1) + decoder_final_kersize = (1, 2) + + self.phase_conv = nn.Sequential( + self.upsample_module_class(h.dense_channel, h.dense_channel, + (1, 3), (1, 2)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel, + decoder_final_kersize) + self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel, + decoder_final_kersize) + + def forward(self, x): + if self.dense_block is not None: + x = self.dense_block(x) + x = self.phase_conv(x) + x_r = self.phase_conv_r(x) + x_i = self.phase_conv_i(x) + x = torch.atan2(x_i, x_r) + return x diff --git a/modelscope/models/audio/ans/zipenhancer_layers/scaling.py b/modelscope/models/audio/ans/zipenhancer_layers/scaling.py new file mode 100644 index 000000000..06dfc2bd6 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/scaling.py @@ -0,0 +1,1055 @@ +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey) +# Copyright (c) 2024 Alibaba, Inc. and its affiliates. +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import math +import random +from typing import Optional, Tuple, Union + +# import k2 +import torch +import torch.nn as nn +from torch import Tensor +from torch.cuda.amp import custom_bwd, custom_fwd + + +def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor: + max_value = torch.max(x, y) + diff = torch.abs(x - y) + return max_value + torch.log1p(torch.exp(-diff)) + + +# RuntimeError: Exporting the operator logaddexp to ONNX opset version +# 14 is not supported. Please feel free to request support or submit +# a pull request on PyTorch GitHub. +# +# The following function is to solve the above error when exporting +# models to ONNX via torch.jit.trace() +def logaddexp(x: Tensor, y: Tensor) -> Tensor: + # Caution(fangjun): Put torch.jit.is_scripting() before + # torch.onnx.is_in_onnx_export(); + # otherwise, it will cause errors for torch.jit.script(). + # + # torch.logaddexp() works for both torch.jit.script() and + # torch.jit.trace() but it causes errors for ONNX export. + # + if torch.jit.is_scripting(): + # Note: We cannot use torch.jit.is_tracing() here as it also + # matches torch.onnx.export(). + return torch.logaddexp(x, y) + elif torch.onnx.is_in_onnx_export(): + return logaddexp_onnx(x, y) + else: + # for torch.jit.trace() + return torch.logaddexp(x, y) + + +class PiecewiseLinear(object): + """ + Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with + the x values in order. x values <[initial x] or >[final x] are map to [initial y], [final y] + respectively. + """ + + def __init__(self, *args): + assert len(args) >= 1, len(args) + if len(args) == 1 and isinstance(args[0], PiecewiseLinear): + self.pairs = list(args[0].pairs) + else: + self.pairs = [(float(x), float(y)) for x, y in args] + for x, y in self.pairs: + assert isinstance(x, (float, int)), type(x) + assert isinstance(y, (float, int)), type(y) + + for i in range(len(self.pairs) - 1): + assert self.pairs[i + 1][0] > self.pairs[i][0], ( + i, + self.pairs[i], + self.pairs[i + 1], + ) + + def __str__(self): + # e.g. 
'PiecewiseLinear((0., 10.), (100., 0.))' + return f'PiecewiseLinear({str(self.pairs)[1:-1]})' + + def __call__(self, x): + if x <= self.pairs[0][0]: + return self.pairs[0][1] + elif x >= self.pairs[-1][0]: + return self.pairs[-1][1] + else: + cur_x, cur_y = self.pairs[0] + for i in range(1, len(self.pairs)): + next_x, next_y = self.pairs[i] + if x >= cur_x and x <= next_x: + return cur_y + (next_y - cur_y) * (x - cur_x) / ( + next_x - cur_x) + cur_x, cur_y = next_x, next_y + assert False + + def __mul__(self, alpha): + return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs]) + + def __add__(self, x): + if isinstance(x, (float, int)): + return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs]) + s, x = self.get_common_basis(x) + return PiecewiseLinear(*[(sp[0], sp[1] + xp[1]) + for sp, xp in zip(s.pairs, x.pairs)]) + + def max(self, x): + if isinstance(x, (float, int)): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear(*[(sp[0], max(sp[1], xp[1])) + for sp, xp in zip(s.pairs, x.pairs)]) + + def min(self, x): + if isinstance(x, float) or isinstance(x, int): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear(*[(sp[0], min(sp[1], xp[1])) + for sp, xp in zip(s.pairs, x.pairs)]) + + def __eq__(self, other): + return self.pairs == other.pairs + + def get_common_basis(self, + p: 'PiecewiseLinear', + include_crossings: bool = False): + """ + Returns (self_mod, p_mod) which are equivalent piecewise linear + functions to self and p, but with the same x values. + + p: the other piecewise linear function + include_crossings: if true, include in the x values positions + where the functions indicate by this and p cross. + """ + assert isinstance(p, PiecewiseLinear), type(p) + + # get sorted x-values without repetition. + x_vals = sorted( + set([x for x, _ in self.pairs] + [x for x, _ in p.pairs])) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + + if include_crossings: + extra_x_vals = [] + for i in range(len(x_vals) - 1): + _compare_results1 = (y_vals1[i] > y_vals2[i]) + _compare_results2 = (y_vals1[i + 1] > y_vals2[i + 1]) + if _compare_results1 != _compare_results2: + # if ((y_vals1[i] > y_vals2[i]) != + # (y_vals1[i + 1] > y_vals2[i + 1])): + # if the two lines in this subsegment potentially cross each other. + diff_cur = abs(y_vals1[i] - y_vals2[i]) + diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1]) + # `pos`, between 0 and 1, gives the relative x position, + # with 0 being x_vals[i] and 1 being x_vals[i+1]. + pos = diff_cur / (diff_cur + diff_next) + extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i]) + extra_x_vals.append(extra_x_val) + if len(extra_x_vals) > 0: + x_vals = sorted(set(x_vals + extra_x_vals)) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + return ( + PiecewiseLinear(*zip(x_vals, y_vals1)), + PiecewiseLinear(*zip(x_vals, y_vals2)), + ) + + +class ScheduledFloat(torch.nn.Module): + """ + This object is a torch.nn.Module only because we want it to show up in [top_level module].modules(); + it does not have a working forward() function. You are supposed to cast it to float, as + in, float(parent_module.whatever), and use it as something like a dropout prob. + + It is a floating point value whose value changes depending on the batch count of the + training loop. It is a piecewise linear function where you specify the (x,y) pairs + in sorted order on x; x corresponds to the batch index. 
For batch-index values before the + first x or after the last x, we just use the first or last y value. + + Example: + self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0) + + `default` is used when self.batch_count is not set or not in training mode or in + torch.jit scripting mode. + """ + + def __init__(self, *args, default: float = 0.0): + super().__init__() + # self.batch_count and self.name will be written to in the training loop. + self.batch_count = None + self.name = None + self.default = default + self.schedule = PiecewiseLinear(*args) + + def extra_repr(self) -> str: + return ( + f'batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}' + ) + + def __float__(self): + batch_count = self.batch_count + if (batch_count is None or not self.training + or torch.jit.is_scripting() or torch.jit.is_tracing()): + return float(self.default) + else: + ans = self.schedule(self.batch_count) + if random.random() < 0.0002: + logging.info( + f'ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}' + ) + return ans + + def __add__(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule + x, default=self.default) + else: + return ScheduledFloat( + self.schedule + x.schedule, default=self.default + x.default) + + def max(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule.max(x), default=self.default) + else: + return ScheduledFloat( + self.schedule.max(x.schedule), + default=max(self.default, x.default)) + + +FloatLike = Union[float, ScheduledFloat] + + +class SoftmaxFunction(torch.autograd.Function): + """ + Tries to handle half-precision derivatives in a randomized way that should + be more accurate for training than the default behavior. + """ + + @staticmethod + def forward(ctx, x: Tensor, dim: int): + ans = x.softmax(dim=dim) + # if x dtype is float16, x.softmax() returns a float32 because + # (presumably) that op does not support float16, and autocast + # is enabled. + if torch.is_autocast_enabled(): + ans = ans.to(torch.float16) + ctx.save_for_backward(ans) + ctx.x_dtype = x.dtype + ctx.dim = dim + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor): + (ans, ) = ctx.saved_tensors + with torch.cuda.amp.autocast(enabled=False): + ans_grad = ans_grad.to(torch.float32) + ans = ans.to(torch.float32) + x_grad = ans_grad * ans + x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True) + return x_grad, None + + +def inplace_softmax(tensor, dim): + # Subtract the maximum value from each Tensor to prevent overflow. + max_vals, _ = tensor.max(dim=dim, keepdim=True) + tensor.sub_(max_vals) + + # # calculate logsumexp + # log_sum_exp = torch.logsumexp(tensor, dim=dim, keepdim=True) + # + # # minus logsumexp + # tensor.sub_(log_sum_exp) + # + # # Compute the exponential of each element, and store the results in-place. + # tensor.exp_() + + # Compute the exponential of each element, and store the results in-place. + tensor.exp_() + + # Compute the sum along the specified dimension, and store the result in-place. + sum_exp = tensor.sum(dim=dim, keepdim=True) + + # Divide each element by the sum along that dimension, and store the result in-place. 
+ tensor.div_(sum_exp) + # tensor.add_(1e-8) + + return tensor + + +def softmax(x: Tensor, dim: int): + if not x.requires_grad or torch.jit.is_scripting() or torch.jit.is_tracing( + ): + # return x.softmax(dim=dim) + # inplace operator + return inplace_softmax(x, dim) + + return SoftmaxFunction.apply(x, dim) + + +class BiasNormFunction(torch.autograd.Function): + # This computes: + # scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp() + # return x * scales + # (after unsqueezing the bias), but it does it in a memory-efficient way so that + # it can just store the returned value (chances are, this will also be needed for + # some other reason, related to the next operation, so we can save memory). + @staticmethod + def forward( + ctx, + x: Tensor, + bias: Tensor, + log_scale: Tensor, + channel_dim: int, + store_output_for_backprop: bool, + ) -> Tensor: + assert bias.ndim == 1 + if channel_dim < 0: + channel_dim = channel_dim + x.ndim + ctx.store_output_for_backprop = store_output_for_backprop + ctx.channel_dim = channel_dim + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + _x_bias_square = torch.mean( + (x - bias)**2, dim=channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * log_scale.exp() + ans = x * scales + ctx.save_for_backward( + ans.detach() if store_output_for_backprop else x, + scales.detach(), + bias.detach(), + log_scale.detach(), + ) + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor) -> Tensor: + ans_or_x, scales, bias, log_scale = ctx.saved_tensors + if ctx.store_output_for_backprop: + x = ans_or_x / scales + else: + x = ans_or_x + x = x.detach() + x.requires_grad = True + bias.requires_grad = True + log_scale.requires_grad = True + with torch.enable_grad(): + # recompute scales from x, bias and log_scale. + _x_bias_square = torch.mean( + (x - bias)**2, dim=ctx.channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * log_scale.exp() + ans = x * scales + ans.backward(gradient=ans_grad) + return x.grad, bias.grad.flatten(), log_scale.grad, None, None + + +class BiasNorm(torch.nn.Module): + """ + This is intended to be a simpler, and hopefully cheaper, replacement for + LayerNorm. The observation this is based on, is that Transformer-type + networks, especially with pre-norm, sometimes seem to set one of the + feature dimensions to a large constant value (e.g. 50), which "defeats" + the LayerNorm because the output magnitude is then not strongly dependent + on the other (useful) features. Presumably the weight and bias of the + LayerNorm are required to allow it to do this. + + Instead, we give the BiasNorm a trainable bias that it can use when + computing the scale for normalization. We also give it a (scalar) + trainable scale on the output. + + + Args: + num_channels: the number of channels, e.g. 512. + channel_dim: the axis/dimension corresponding to the channel, + interpreted as an offset from the input's ndim if negative. + This is NOT the num_channels; it should typically be one of + {-2, -1, 0, 1, 2, 3}. + log_scale: the initial log-scale that we multiply the output by; this + is learnable. + log_scale_min: FloatLike, minimum allowed value of log_scale + log_scale_max: FloatLike, maximum allowed value of log_scale + store_output_for_backprop: only possibly affects memory use; recommend + to set to True if you think the output of this module is more likely + than the input of this module to be required to be stored for the + backprop. 
+ """ + + def __init__( + self, + num_channels: int, + channel_dim: int = -1, # CAUTION: see documentation. + log_scale: float = 1.0, + log_scale_min: float = -1.5, + log_scale_max: float = 1.5, + store_output_for_backprop: bool = False, + ) -> None: + super(BiasNorm, self).__init__() + self.num_channels = num_channels + self.channel_dim = channel_dim + self.log_scale = nn.Parameter(torch.tensor(log_scale)) + self.bias = nn.Parameter( + torch.empty(num_channels).normal_(mean=0, std=1e-4)) + + self.log_scale_min = log_scale_min + self.log_scale_max = log_scale_max + + self.store_output_for_backprop = store_output_for_backprop + + def forward(self, x: Tensor) -> Tensor: + assert x.shape[self.channel_dim] == self.num_channels + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + channel_dim = self.channel_dim + if channel_dim < 0: + channel_dim += x.ndim + bias = self.bias + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + _x_bias_square = torch.mean( + (x - bias)**2, dim=channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * self.log_scale.exp() + return x * scales + + log_scale = limit_param_value( + self.log_scale, + min=float(self.log_scale_min), + max=float(self.log_scale_max), + training=self.training, + ) + + return BiasNormFunction.apply(x, self.bias, log_scale, + self.channel_dim, + self.store_output_for_backprop) + + +def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear: + """ + Behaves like a constructor of a modified version of nn.Linear + that gives an easy way to set the default initial parameter scale. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + ans = nn.Linear(*args, **kwargs) + with torch.no_grad(): + ans.weight[:] *= initial_scale + if ans.bias is not None: + torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, + 0.1 * initial_scale) + return ans + + +class ChunkCausalDepthwiseConv1d(torch.nn.Module): + """ + Behaves like a depthwise 1d convolution, except that it is causal in + a chunkwise way, as if we had a block-triangular attention mask. + The chunk size is provided at test time (it should probably be + kept in sync with the attention mask). + + This has a little more than twice the parameters of a conventional + depthwise conv1d module: we implement it by having one + depthwise convolution, of half the width, that is causal (via + right-padding); and one depthwise convolution that is applied only + within chunks, that we multiply by a scaling factor which depends + on the position within the chunk. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + + def __init__( + self, + channels: int, + kernel_size: int, + initial_scale: float = 1.0, + bias: bool = True, + ): + super().__init__() + assert kernel_size % 2 == 1 + + half_kernel_size = (kernel_size + 1) // 2 + # will pad manually, on one side. 
+ self.causal_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=half_kernel_size, + padding=0, + bias=True, + ) + + self.chunkwise_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias=bias, + ) + + # first row is correction factors added to the scale near the left edge of the chunk, + # second row is correction factors added to the scale near the right edge of the chunk, + # both of these are added to a default scale of 1.0. + self.chunkwise_conv_scale = nn.Parameter( + torch.zeros(2, channels, kernel_size)) + self.kernel_size = kernel_size + + with torch.no_grad(): + self.causal_conv.weight[:] *= initial_scale + self.chunkwise_conv.weight[:] *= initial_scale + if bias: + torch.nn.init.uniform_(self.causal_conv.bias, + -0.1 * initial_scale, + 0.1 * initial_scale) + + def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor: + """Forward function. + + Args: + x: a Tensor of shape (batch_size, channels, seq_len) + chunk_size: the chunk size, in frames; does not have to divide seq_len exactly. + """ + (batch_size, num_channels, seq_len) = x.shape + + # half_kernel_size = self.kernel_size + 1 // 2 + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. It's the amount by which we must pad on the left, + # to make the convolution causal. + left_pad = self.kernel_size // 2 + + if chunk_size < 0 or chunk_size > seq_len: + chunk_size = seq_len + right_pad = -seq_len % chunk_size + + x = torch.nn.functional.pad(x, (left_pad, right_pad)) + + x_causal = self.causal_conv(x[..., :left_pad + seq_len]) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + num_chunks = x_chunk.shape[2] // chunk_size + x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, + chunk_size) + x_chunk = x_chunk.permute(0, 2, 1, 3).reshape(batch_size * num_chunks, + num_channels, chunk_size) + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size) + + x_chunk = x_chunk * chunk_scale + x_chunk = x_chunk.reshape(batch_size, num_chunks, num_channels, + chunk_size).permute(0, 2, 1, 3) + x_chunk = x_chunk.reshape(batch_size, num_channels, + num_chunks * chunk_size)[..., :seq_len] + + return x_chunk + x_causal + + def _get_chunk_scale(self, chunk_size: int): + """Returns tensor of shape (num_channels, chunk_size) that will be used to + scale the output of self.chunkwise_conv.""" + left_edge = self.chunkwise_conv_scale[0] + right_edge = self.chunkwise_conv_scale[1] + if chunk_size < self.kernel_size: + left_edge = left_edge[:, :chunk_size] + right_edge = right_edge[:, -chunk_size:] + else: + t = chunk_size - self.kernel_size + channels = left_edge.shape[0] + pad = torch.zeros( + channels, t, device=left_edge.device, dtype=left_edge.dtype) + left_edge = torch.cat((left_edge, pad), dim=-1) + right_edge = torch.cat((pad, right_edge), dim=-1) + return 1.0 + (left_edge + right_edge) + + def streaming_forward( + self, + x: Tensor, + cache: Tensor, + ) -> Tuple[Tensor, Tensor]: + """Streaming Forward function. + + Args: + x: a Tensor of shape (batch_size, channels, seq_len) + cache: cached left context of shape (batch_size, channels, left_pad) + """ + (batch_size, num_channels, seq_len) = x.shape + + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. 
It's the amount by which we must pad on the left, + # to make the convolution causal. + left_pad = self.kernel_size // 2 + + # Pad cache + assert cache.shape[-1] == left_pad, (cache.shape[-1], left_pad) + x = torch.cat([cache, x], dim=2) + # Update cache + cache = x[..., -left_pad:] + + x_causal = self.causal_conv(x) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size=seq_len) + x_chunk = x_chunk * chunk_scale + + return x_chunk + x_causal, cache + + +def penalize_abs_values_gt(x: Tensor, + limit: float, + penalty: float, + name: str = None) -> Tensor: + """ + Returns x unmodified, but in backprop will put a penalty for the excess of + the absolute values of elements of x over the limit "limit". E.g. if + limit == 10.0, then if x has any values over 10 it will get a penalty. + + Caution: the value of this penalty will be affected by grad scaling used + in automatic mixed precision training. For this reasons we use this, + it shouldn't really matter, or may even be helpful; we just use this + to disallow really implausible values of scores to be given to softmax. + + The name is for randomly printed debug info. + """ + x_sign = x.sign() + over_limit = (x.abs() - limit) > 0 + # The following is a memory efficient way to penalize the absolute values of + # x that's over the limit. (The memory efficiency comes when you think + # about which items torch needs to cache for the autograd, and which ones it + # can throw away). The numerical value of aux_loss as computed here will + # actually be larger than it should be, by limit * over_limit.sum(), but it + # has the same derivative as the real aux_loss which is penalty * (x.abs() - + # limit).relu(). + aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x) + # note: we don't do sum() here on aux)_loss, but it's as if we had done + # sum() due to how with_loss() works. + x = with_loss(x, aux_loss, name) + # you must use x for something, or this will be ineffective. + return x + + +class WithLoss(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: Tensor, y: Tensor, name: str): + ctx.y_shape = y.shape + if random.random() < 0.002 and name is not None: + loss_sum = y.sum().item() + logging.info(f'WithLoss: name={name}, loss-sum={loss_sum:.3e}') + return x + + @staticmethod + def backward(ctx, ans_grad: Tensor): + return ( + ans_grad, + torch.ones( + ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device), + None, + ) + + +def with_loss(x, y, name): + # returns x but adds y.sum() to the loss function. + return WithLoss.apply(x, y, name) + + +class LimitParamValue(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: Tensor, min: float, max: float): + ctx.save_for_backward(x) + assert max >= min + ctx.min = min + ctx.max = max + return x + + @staticmethod + def backward(ctx, x_grad: Tensor): + (x, ) = ctx.saved_tensors + # where x < ctx.min, ensure all grads are negative (this will tend to make + # x more positive). + x_grad = x_grad * torch.where( + torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0) + # where x > ctx.max, ensure all grads are positive (this will tend to make + # x more negative). 
+ x_grad *= torch.where( + torch.logical_and(x_grad < 0, x > ctx.max), -1.0, 1.0) + return x_grad, None, None + + +def limit_param_value(x: Tensor, + min: float, + max: float, + prob: float = 0.6, + training: bool = True): + # You apply this to (typically) an nn.Parameter during training to ensure that its + # (elements mostly) stays within a supplied range. This is done by modifying the + # gradients in backprop. + # It's not necessary to do this on every batch: do it only some of the time, + # to save a little time. + if training and random.random() < prob: + return LimitParamValue.apply(x, min, max) + else: + return x + + +def _no_op(x: Tensor) -> Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x + else: + # a no-op function that will have a node in the autograd graph, + # to avoid certain bugs relating to backward hooks + return x.chunk(1, dim=-1)[0] + + +class Identity(torch.nn.Module): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return _no_op(x) + + +# Dropout2 is just like normal dropout, except it supports schedules on the dropout rates. +class Dropout2(nn.Module): + + def __init__(self, p: FloatLike): + super().__init__() + self.p = p + + def forward(self, x: Tensor) -> Tensor: + return torch.nn.functional.dropout( + x, p=float(self.p), training=self.training) + + +class SwooshLFunction(torch.autograd.Function): + """ + swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + coeff = -0.08 + + with (torch.cuda.amp.autocast(enabled=False)): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 4.0) + coeff * x - 0.035 + + if not requires_grad: + return y + + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = coeff + ceil = 1.0 + coeff + 0.005 + _diff = (grad - floor) * (255.0 / (ceil - floor)) + d_scaled = _diff + torch.rand_like(grad) + if __name__ == '__main__': + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d, ) = ctx.saved_tensors + # the same constants as used in forward pass. + + coeff = -0.08 + floor = coeff + ceil = 1.0 + coeff + 0.005 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshL(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035 + # if not x.requires_grad: + # return k2.swoosh_l_forward(x) + # else: + # return k2.swoosh_l(x) + return SwooshLFunction.apply(x) + + +class SwooshLOnnx(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035 + + +class SwooshRFunction(torch.autograd.Function): + """ + swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 + + derivatives are between -0.08 and 0.92. 
+ """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + with torch.cuda.amp.autocast(enabled=False): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + + if not requires_grad: + return y + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = -0.08 + ceil = 0.925 + + _diff = (grad - floor) * (255.0 / (ceil - floor)) + d_scaled = _diff + torch.rand_like(grad) + if __name__ == '__main__': + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d, ) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.08 + ceil = 0.925 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshR(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + # if not x.requires_grad: + # return k2.swoosh_r_forward(x) + # else: + # return k2.swoosh_r(x) + return SwooshRFunction.apply(x) + + +class SwooshROnnx(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687 + + +# simple version of SwooshL that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshLForward(x: Tensor): + x_offset = x - 4.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + return log_sum - 0.08 * x - 0.035 + + +def SwooshLForwardAndDeriv(x: Tensor): + """ + https://k2-fsa.github.io/k2/python_api/api.html#swoosh-l-forward-and-deriv + :param x: + :return: + """ + x_offset = x - 4.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + + deriv = 0.92 - 1 / (1 + x_offset.exp()) + + return log_sum - 0.08 * x - 0.035, deriv + + +# simple version of SwooshR that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshRForward(x: Tensor): + x_offset = x - 1.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + return log_sum - 0.08 * x - 0.313261687 + + +def SwooshRForwardAndDeriv(x: Tensor): + """ + https://k2-fsa.github.io/k2/python_api/api.html#swoosh-r-forward-and-deriv + :param x: + :return: + """ + x_offset = x - 1.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + + deriv = 0.92 - 1 / (1 + x_offset.exp()) + + return log_sum - 0.08 * x - 0.313261687, deriv + + +class ActivationDropoutAndLinear(torch.nn.Module): + """ + This merges an activation function followed by dropout and then a nn.Linear module; + it does so in a memory efficient way so that it only stores the input to the whole + module. 
If activation == SwooshL and dropout_shared_dim != None, this will be + equivalent to: + nn.Sequential(SwooshL(), + Dropout3(dropout_p, shared_dim=dropout_shared_dim), + ScaledLinear(in_channels, out_channels, bias=bias, + initial_scale=initial_scale)) + If dropout_shared_dim is None, the dropout would be equivalent to + Dropout2(dropout_p). Note: Dropout3 will be more memory efficient as the dropout + mask is smaller. + + Args: + in_channels: number of input channels, e.g. 256 + out_channels: number of output channels, e.g. 256 + bias: if true, have a bias + activation: the activation function, for now just support SwooshL. + dropout_p: the dropout probability or schedule (happens after nonlinearity). + dropout_shared_dim: the dimension, if any, across which the dropout mask is + shared (e.g. the time dimension). If None, this may be less memory + efficient if there are modules before this one that cache the input + for their backprop (e.g. Balancer or Whiten). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + bias: bool = True, + activation: str = 'SwooshL', + dropout_p: FloatLike = 0.0, + dropout_shared_dim: Optional[int] = -1, + initial_scale: float = 1.0, + ): + super().__init__() + # create a temporary module of nn.Linear that we'll steal the + # weights and bias from + linear_module = ScaledLinear( + in_channels, out_channels, bias=bias, initial_scale=initial_scale) + + self.weight = linear_module.weight + # register_parameter properly handles making it a parameter when l.bias + # is None. I think there is some reason for doing it this way rather + # than just setting it to None but I don't know what it is, maybe + # something to do with exporting the module.. + self.register_parameter('bias', linear_module.bias) + + self.activation = activation + self.dropout_p = dropout_p + self.dropout_shared_dim = dropout_shared_dim + + def forward(self, x: Tensor): + # if torch.jit.is_scripting() or torch.jit.is_tracing(): + if torch.jit.is_scripting() or torch.jit.is_tracing() or ( + not self.training): + if self.activation == 'SwooshL': + x = SwooshLForward(x) + # x = k2.swoosh_l_forward(x) + elif self.activation == 'SwooshR': + x = SwooshRForward(x) + # x = k2.swoosh_r_forward(x) + else: + assert False, self.activation + return torch.nn.functional.linear(x, self.weight, self.bias) + + # print(f"dropout_p:{float(self.dropout_p)}") + # print(f"dropout_shared_dim:{self.dropout_shared_dim}") + # return ActivationDropoutAndLinearFunction.apply( + # x, + # self.weight, + # self.bias, + # self.activation, + # float(self.dropout_p), + # self.dropout_shared_dim, + # ) + + +def convert_num_channels(x: Tensor, num_channels: int) -> Tensor: + """ + + :param x: (b, c, t, f) + :param num_channels: + :return: x: (b, num_channels, t, f) + """ + if num_channels <= x.shape[1]: + return x[:, :num_channels, :, :] + else: + shape = list(x.shape) + shape[1] = num_channels - shape[1] + zeros = torch.zeros(shape, dtype=x.dtype, device=x.device) + return torch.cat((x, zeros), dim=1) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py b/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py new file mode 100644 index 000000000..32bf7cb4d --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. 
and its affiliates. + +import copy +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor, nn + +from .scaling import FloatLike, ScheduledFloat, convert_num_channels +from .zipformer import (BypassModule, CompactRelPositionalEncoding, + SimpleDownsample, SimpleUpsample, + Zipformer2EncoderLayer) + + +class DualPathZipformer2Encoder(nn.Module): + r"""DualPathZipformer2Encoder is a stack of N encoder layers + it has two kinds of EncoderLayer including F_Zipformer2EncoderLayer and T_Zipformer2EncoderLayer + the features are modeling with the shape of + [B, C, T, F] -> [F, T * B, C] -> -> [B, C, T, F] -> [T, F * B, C] -> [B, C, T, F] + + Args: + encoder_layer: an instance of the Zipformer2EncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + pos_dim: the dimension for the relative positional encoding + + Examples:: + >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8) + >>> dualpath_zipformer_encoder = DualPathZipformer2Encoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 512, 161, 101) + >>> out = dualpath_zipformer_encoder(src) + """ + + def __init__( + self, + encoder_layer: nn.Module, + num_layers: int, + pos_dim: int, + dropout: float, + warmup_begin: float, + warmup_end: float, + initial_layerdrop_rate: float = 0.5, + final_layerdrop_rate: float = 0.05, + bypass_layer=None, + ) -> None: + """ + Initialize the DualPathZipformer2Encoder module with the specified + encoder layer, number of layers, positional dimension, dropout rate, warmup period, and layer drop rates. + """ + super().__init__() + self.encoder_pos = CompactRelPositionalEncoding( + pos_dim, dropout_rate=0.15, length_factor=1.0) + + self.f_layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)]) + self.t_layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)]) + self.bypass_layers = nn.ModuleList( + [bypass_layer for i in range(num_layers * 2)]) + self.num_layers = num_layers + + assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end) + + delta = (1.0 / num_layers) * (warmup_end - warmup_begin) + cur_begin = warmup_begin # interpreted as a training batch index + for i in range(num_layers): + cur_end = cur_begin + delta + self.f_layers[i].bypass.skip_rate = ScheduledFloat( + (cur_begin, initial_layerdrop_rate), + (cur_end, final_layerdrop_rate), + default=0.0, + ) + self.t_layers[i].bypass.skip_rate = ScheduledFloat( + (cur_begin, initial_layerdrop_rate), + (cur_end, final_layerdrop_rate), + default=0.0, + ) + cur_begin = cur_end + + def forward( + self, + src: Tensor, + chunk_size: int = -1, + feature_mask: Union[Tensor, float] = 1.0, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Pass the input through the encoder layers in a dual-path manner, processing both temporal and frequency dimensions. + + Args: + src: the dual-path sequence to the encoder (required): + shape (batch_size, embedding_dim, seq_len, frequency_len). + chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking. No used. + feature_mask: something that broadcasts with src, that we'll multiply `src` + by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) + attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), + interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). + True means masked position. May be None. 
+ src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. + + Returns: a Tensor with the same shape as src. + """ + + # src: (b, c, t, f) + b, c, t, f = src.size() + src_f = src.permute(3, 0, 2, 1).contiguous().view(f, b * t, c) + src_t = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c) + pos_emb_f = self.encoder_pos(src_f) + pos_emb_t = self.encoder_pos(src_t) + + output = src + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + output = output * feature_mask + + for i in range(len(self.f_layers)): + # output_org = output + # (b, c, t, f) + output_f_org = output.permute(3, 2, 0, + 1).contiguous() # (f, t, b, c) + output_f = output_f_org.view(f, t * b, c) + # (f, t * b, c) + output_f = self.f_layers[i]( + output_f, + pos_emb_f, + # chunk_size=chunk_size, + # attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + output_f = output_f.view(f, t, b, c) + output_f = self.bypass_layers[i * 2](output_f_org, output_f) + + # (f, t, b, c) + output = output_f.permute(2, 3, 1, 0).contiguous() + # (b, c, t, f) + # output = self.bypass_layers[i * 2](output_org, output) + + # output_org = output + + output_t_org = output.permute(2, 3, 0, + 1).contiguous() # (t, f, b, c) + output_t = output_t_org.view(t, f * b, c) + output_t = self.t_layers[i]( + output_t, + pos_emb_t, + # chunk_size=chunk_size, + # attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + output_t = output_t.view(t, f, b, c) + output_t = self.bypass_layers[i * 2 + 1](output_t_org, output_t) + # (t, f, b, c) + + output = output_t.permute(2, 3, 0, 1).contiguous() + # (b, c, t, f) + # output = self.bypass_layers[i * 2 + 1](output_org, output) + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + output = output * feature_mask + + return output + + +class DualPathDownsampledZipformer2Encoder(nn.Module): + r""" + DualPathDownsampledZipformer2Encoder is a dual-path zipformer encoder evaluated at a reduced frame rate, + after convolutional downsampling, and then upsampled again at the output, and combined + with the origin input, so that the output has the same shape as the input. + The features are downsampled-upsampled at the time and frequency domain. + + """ + + def __init__(self, encoder: nn.Module, dim: int, t_downsample: int, + f_downsample: int, dropout: FloatLike): + """ + Initialize the DualPathDownsampledZipformer2Encoder module with the specified + encoder, dimension, temporal and frequency downsampling factors r, and dropout rate. + """ + super(DualPathDownsampledZipformer2Encoder, self).__init__() + self.downsample_factor = t_downsample + self.t_downsample_factor = t_downsample + self.f_downsample_factor = f_downsample + + if self.t_downsample_factor != 1: + self.downsample_t = SimpleDownsample(dim, t_downsample, dropout) + self.upsample_t = SimpleUpsample(dim, t_downsample) + if self.f_downsample_factor != 1: + self.downsample_f = SimpleDownsample(dim, f_downsample, dropout) + self.upsample_f = SimpleUpsample(dim, f_downsample) + + # self.num_layers = encoder.num_layers + self.encoder = encoder + + self.out_combiner = BypassModule(dim, straight_through_rate=0) + + def forward( + self, + src: Tensor, + chunk_size: int = -1, + feature_mask: Union[Tensor, float] = 1.0, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Downsample the input, process through the encoder, and then upsample back to the original shape. 
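+
+        A sketch of the intended shape flow (sizes are illustrative only): with
+        t_downsample=2 and f_downsample=2, an input of shape (4, 64, 100, 80)
+        is reduced to roughly (4, 64, 50, 40) before the wrapped dual-path
+        encoder runs, then upsampled along frequency and time again and
+        combined with the original input, giving (4, 64, 100, 80) back.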
+ + Args: + src: the sequence to the encoder (required): shape (batch_size, embedding_dim, seq_len, frequency_len). + feature_mask: 1.0 + attn_mask: None + src_key_padding_mask: None. + + Returns: a Tensor with the same shape as src. (batch_size, embedding_dim, seq_len, frequency_len) + """ + # src: (b, c, t, f) + b, c, t, f = src.size() + # print(src.size()) + + src_orig = src.permute(2, 3, 0, 1) # (t, f, b, c) + + # (b, c, t, f) + src = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c) + # -> (t, b * f, c) + if self.t_downsample_factor != 1: + src = self.downsample_t(src) + # (t//ds + 1, b * f, c) + downsample_t = src.size(0) + src = src.view(downsample_t, b, f, + c).permute(2, 1, 0, + 3).contiguous().view(f, b * downsample_t, c) + # src = self.upsample_f(src) + if self.f_downsample_factor != 1: + src = self.downsample_f(src) + # (f//ds + 1, b * downsample_t, c) + downsample_f = src.size(0) + src = src.view(downsample_f, b, downsample_t, c).permute(1, 3, 2, 0) + # (b, c, downsample_t, downsample_f) + # print(src.size()) + + # ds = self.downsample_factor + # if attn_mask is not None: + # attn_mask = attn_mask[::ds, ::ds] + + src = self.encoder( + src, + chunk_size=chunk_size, + feature_mask=feature_mask, + attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + + # (b, c, downsample_t, downsample_f) + src = src.permute(3, 0, 2, + 1).contiguous().view(downsample_f, b * downsample_t, + c) + if self.f_downsample_factor != 1: + src = self.upsample_f(src) + # (f, b * downsample_t, c) + src = src[:f].view(f, b, downsample_t, + c).permute(2, 1, 0, 3).contiguous().view( + downsample_t, b * f, c) + # (downsample_t, b * f, c) + if self.t_downsample_factor != 1: + src = self.upsample_t(src) + # (t, b * f, c) + src = src[:t].view(t, b, f, c).permute(0, 2, 1, 3).contiguous() + # (t, f, b, c) + out = self.out_combiner(src_orig, src) + # (t, f, b, c) + + out = out.permute(2, 3, 0, 1).contiguous() + # (b, c, t, f) + # print(out.size()) + + # remove any extra frames that are not a multiple of downsample_factor + # src = src[: src_orig.shape[0]] # slice here + + return out + + +class Zipformer2DualPathEncoder(nn.Module): + + def __init__( + self, + output_downsampling_factor: int = 2, + downsampling_factor: Tuple[int] = (2, 4), + f_downsampling_factor: Tuple[int] = None, + encoder_dim: Union[int, Tuple[int]] = 384, + num_encoder_layers: Union[int, Tuple[int]] = 4, + encoder_unmasked_dim: Union[int, Tuple[int]] = 256, + query_head_dim: Union[int, Tuple[int]] = 24, + pos_head_dim: Union[int, Tuple[int]] = 4, + value_head_dim: Union[int, Tuple[int]] = 12, + num_heads: Union[int, Tuple[int]] = 8, + feedforward_dim: Union[int, Tuple[int]] = 1536, + cnn_module_kernel: Union[int, Tuple[int]] = 31, + pos_dim: int = 192, + dropout: FloatLike = None, # see code below for default + warmup_batches: float = 4000.0, + causal: bool = False, + chunk_size: Tuple[int] = [-1], + left_context_frames: Tuple[int] = [-1], + ): + """ + Initialize the Zipformer2DualPathEncoder module. + Zipformer2DualPathEncoder processes the hidden features of the noisy speech using dual-path modeling. + It has two kinds of blocks: DualPathZipformer2Encoder and DualPathDownsampledZipformer2Encoder. + DualPathZipformer2Encoder processes the 4D features with the shape of [B, C, T, F]. + DualPathDownsampledZipformer2Encoder first downsamples the hidden features + and processes features using dual-path modeling like DualPathZipformer2Encoder. + + Args: + Various hyperparameters and settings for the encoder. 
+ """ + super(Zipformer2DualPathEncoder, self).__init__() + + if dropout is None: + dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) + + def _to_tuple(x): + """Converts a single int or a 1-tuple of an int to a tuple with the same length + as downsampling_factor""" + if isinstance(x, int): + x = (x, ) + if len(x) == 1: + x = x * len(downsampling_factor) + else: + assert len(x) == len(downsampling_factor) and isinstance( + x[0], int) + return x + + self.output_downsampling_factor = output_downsampling_factor # int + self.downsampling_factor = downsampling_factor # tuple + + if f_downsampling_factor is None: + f_downsampling_factor = downsampling_factor + self.f_downsampling_factor = _to_tuple(f_downsampling_factor) + + self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple + self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple( + encoder_unmasked_dim) # tuple + num_encoder_layers = _to_tuple(num_encoder_layers) + self.num_encoder_layers = num_encoder_layers + self.query_head_dim = query_head_dim = _to_tuple(query_head_dim) + self.value_head_dim = value_head_dim = _to_tuple(value_head_dim) + pos_head_dim = _to_tuple(pos_head_dim) + self.num_heads = num_heads = _to_tuple(num_heads) + feedforward_dim = _to_tuple(feedforward_dim) + self.cnn_module_kernel = cnn_module_kernel = _to_tuple( + cnn_module_kernel) + + self.causal = causal + self.chunk_size = chunk_size + self.left_context_frames = left_context_frames + + for u, d in zip(encoder_unmasked_dim, encoder_dim): + assert u <= d + + # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder + encoders = [] + + num_encoders = len(downsampling_factor) + # "1,2,4,8,4,2", + + for i in range(num_encoders): + encoder_layer = Zipformer2EncoderLayer( + embed_dim=encoder_dim[i], + pos_dim=pos_dim, + num_heads=num_heads[i], + query_head_dim=query_head_dim[i], + pos_head_dim=pos_head_dim[i], + value_head_dim=value_head_dim[i], + feedforward_dim=feedforward_dim[i], + dropout=dropout, + cnn_module_kernel=cnn_module_kernel[i], + causal=causal, + ) + + # For the segment of the warmup period, we let the Conv2dSubsampling + # layer learn something. Then we start to warm up the other encoders. + encoder = DualPathZipformer2Encoder( + encoder_layer, + num_encoder_layers[i], + pos_dim=pos_dim, + dropout=dropout, + warmup_begin=warmup_batches * (i + 1) / (num_encoders + 1), + warmup_end=warmup_batches * (i + 2) / (num_encoders + 1), + final_layerdrop_rate=0.035 * (downsampling_factor[i]**0.5), + bypass_layer=BypassModule( + encoder_dim[i], straight_through_rate=0), + ) + + if downsampling_factor[i] != 1 or f_downsampling_factor[i] != 1: + encoder = DualPathDownsampledZipformer2Encoder( + encoder, + dim=encoder_dim[i], + t_downsample=downsampling_factor[i], + f_downsample=f_downsampling_factor[i], + dropout=dropout, + ) + + encoders.append(encoder) + + self.encoders = nn.ModuleList(encoders) + + self.downsample_output = SimpleDownsample( + max(encoder_dim), + downsample=output_downsampling_factor, + dropout=dropout) + + def forward(self, x): + """ + Forward pass of the Zipformer2DualPathEncoder module. + + Args: + x (Tensor): Input tensor of shape [B, C, T, F]. + + Returns: + Tensor: Output tensor after passing through the encoder. 
+ """ + outputs = [] + + # if torch.jit.is_scripting() or torch.jit.is_tracing(): + # feature_masks = [1.0] * len(self.encoder_dim) + # else: + # feature_masks = self.get_feature_masks(x) + feature_masks = [1.0] * len(self.encoder_dim) + attn_mask = None + + chunk_size = -1 + # left_context_chunks = -1 + + for i, module in enumerate(self.encoders): + + x = convert_num_channels(x, self.encoder_dim[i]) + + x = module( + x, + chunk_size=chunk_size, + feature_mask=feature_masks[i], + src_key_padding_mask=None, + attn_mask=attn_mask, + ) + outputs.append(x) + + # (b, c, t, f) + return x + + +if __name__ == '__main__': + + # {2,2,2,2,2,2} {192,256,256,256,256,256} {512,768,768,768,768,768} + downsampling_factor = (1, 2, 4, 3) # + encoder_dim = (16, 32, 64, 64) + pos_dim = 48 # zipformer base设置 + num_heads = (4, 4, 4, 4) # "4,4,4,8,4,4" + query_head_dim = (16, ) * len(downsampling_factor) # 32 + pos_head_dim = (4, ) * len(downsampling_factor) # 4 + value_head_dim = (12, ) * len(downsampling_factor) # 12 + feedforward_dim = (32, 64, 128, 128) # + dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) + cnn_module_kernel = (15, ) * len(downsampling_factor) # 31,31,15,15,15,31 + causal = False + encoder_unmasked_dim = (16, ) * len(downsampling_factor) + + num_encoder_layers = (1, 1, 1, 1) + warmup_batches = 4000.0 + + net = Zipformer2DualPathEncoder( + output_downsampling_factor=1, + downsampling_factor=downsampling_factor, + num_encoder_layers=num_encoder_layers, + encoder_dim=encoder_dim, + encoder_unmasked_dim=encoder_unmasked_dim, + query_head_dim=query_head_dim, + pos_head_dim=pos_head_dim, + value_head_dim=value_head_dim, + pos_dim=pos_dim, + num_heads=num_heads, + feedforward_dim=feedforward_dim, + cnn_module_kernel=cnn_module_kernel, + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + warmup_batches=warmup_batches, + causal=causal, + ) + + # net = DownsampledZipformer2Encoder( + # None, 128, 2, 0. + # ) + # x = torch.randn((101, 2, 128)) + b = 4 + t = 321 + f = 101 + c = 64 + + # x = torch.randn((101, 2, 128)) + x = torch.randn((b, c, t, f)) + + x = net(x) + print(x.size()) diff --git a/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py b/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py new file mode 100644 index 000000000..42ad6df1d --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py @@ -0,0 +1,1084 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey, +# Zengwei Yao) +# Copyright (c) 2024 Alibaba, Inc. and its affiliates. +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import logging +import math +import random +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor, nn + +from .scaling import \ + Identity # more friendly to backward hooks than nn.Identity(), for diagnostic reasons. +from .scaling import \ + ScaledLinear # not as in other dirs.. 
just scales down initial parameter values. +from .scaling import (ActivationDropoutAndLinear, BiasNorm, + ChunkCausalDepthwiseConv1d, Dropout2, FloatLike, + ScheduledFloat, limit_param_value, + penalize_abs_values_gt, softmax) + + +class Zipformer2EncoderLayer(nn.Module): + """ + Args: + embed_dim: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + feedforward_dim: the dimension of the feedforward network model (required). + dropout: the dropout value (default=0.1). + cnn_module_kernel (int): Kernel size of convolution module (default=31). + + Examples:: + >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = encoder_layer(src, pos_emb) + """ + + def __init__( + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + value_head_dim: int, + feedforward_dim: int, + dropout: FloatLike = 0.1, + cnn_module_kernel: int = 31, + causal: bool = False, + attention_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0), + conv_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), + (16000, 0.0), + default=0), + const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), + (4000.0, 0.025), + default=0), + ff2_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), + (50000.0, 0.0)), + ff3_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), + (50000.0, 0.0)), + bypass_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), + (4000.0, 0.02), + default=0), + ) -> None: + super(Zipformer2EncoderLayer, self).__init__() + self.embed_dim = embed_dim + + # self.bypass implements layer skipping as well as bypass; see its default values. + self.bypass = BypassModule( + embed_dim, skip_rate=bypass_skip_rate, straight_through_rate=0) + # bypass_mid is bypass used in the middle of the layer. + self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0) + + # skip probability for dynamic modules (meaning: anything but feedforward). + self.attention_skip_rate = copy.deepcopy(attention_skip_rate) + # an additional skip probability that applies to ConvModule to stop it from + # contributing too much early on. + self.conv_skip_rate = copy.deepcopy(conv_skip_rate) + + # ff2_skip_rate is to prevent the ff2 module from having output that's too big + # compared to its residual. 
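+        # (The ScheduledFloat defaults above are schedules over the training batch
+        #  index: e.g. ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)) gives
+        #  a skip probability that decays from 0.1 at batch 0 to 0.01 by batch 4000
+        #  and to 0.0 by batch 50000.)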
+ self.ff2_skip_rate = copy.deepcopy(ff2_skip_rate) + self.ff3_skip_rate = copy.deepcopy(ff3_skip_rate) + + self.const_attention_rate = copy.deepcopy(const_attention_rate) + + self.self_attn_weights = RelPositionMultiheadAttentionWeights( + embed_dim, + pos_dim=pos_dim, + num_heads=num_heads, + query_head_dim=query_head_dim, + pos_head_dim=pos_head_dim, + dropout=0.0, + ) + + self.self_attn1 = SelfAttention(embed_dim, num_heads, value_head_dim) + + self.self_attn2 = SelfAttention(embed_dim, num_heads, value_head_dim) + + self.feed_forward1 = FeedforwardModule(embed_dim, + (feedforward_dim * 3) // 4, + dropout) + + self.feed_forward2 = FeedforwardModule(embed_dim, feedforward_dim, + dropout) + + self.feed_forward3 = FeedforwardModule(embed_dim, + (feedforward_dim * 5) // 4, + dropout) + + self.nonlin_attention = NonlinAttention( + embed_dim, hidden_channels=3 * embed_dim // 4) + + self.conv_module1 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal) + + self.conv_module2 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal) + + # TODO: remove it + self.bypass_scale = nn.Parameter(torch.full((embed_dim, ), 0.5)) + + self.norm = BiasNorm(embed_dim) + + self.balancer1 = Identity() + self.balancer_na = Identity() + self.balancer_ff2 = Identity() + self.balancer_ff3 = Identity() + self.whiten = Identity() + self.balancer2 = Identity() + + def get_sequence_dropout_mask(self, x: Tensor, + dropout_rate: float) -> Optional[Tensor]: + if (dropout_rate == 0.0 or not self.training + or torch.jit.is_scripting() or torch.jit.is_tracing()): + return None + batch_size = x.shape[1] + mask = (torch.rand(batch_size, 1, device=x.device) > dropout_rate).to( + x.dtype) + return mask + + def sequence_dropout(self, x: Tensor, dropout_rate: float) -> Tensor: + """ + Apply sequence-level dropout to x. + x shape: (seq_len, batch_size, embed_dim) + """ + dropout_mask = self.get_sequence_dropout_mask(x, dropout_rate) + if dropout_mask is None: + return x + else: + return x * dropout_mask + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + chunk_size: int = -1, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Pass the input through the encoder layer. + Args: + src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim). + pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim) + chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking. + feature_mask: something that broadcasts with src, that we'll multiply `src` + by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) + attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), + interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). + True means masked position. May be None. + src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. 
+ + Returns: + A tensor which has the same shape as src + """ + src_orig = src + + # dropout rate for non-feedforward submodules + if torch.jit.is_scripting() or torch.jit.is_tracing(): + attention_skip_rate = 0.0 + else: + attention_skip_rate = ( + float(self.attention_skip_rate) if self.training else 0.0) + + # attn_weights: (num_heads, batch_size, seq_len, seq_len) + attn_weights = self.self_attn_weights( + src, + pos_emb=pos_emb, + attn_mask=attn_mask, + key_padding_mask=src_key_padding_mask, + ) + + src = src + self.feed_forward1(src) + + self_attn_dropout_mask = self.get_sequence_dropout_mask( + src, attention_skip_rate) + + selected_attn_weights = attn_weights[0:1] + if torch.jit.is_scripting() or torch.jit.is_tracing(): + pass + elif self.training and random.random() < float( + self.const_attention_rate): + # Make attention weights constant. The intention is to + # encourage these modules to do something similar to an + # averaging-over-time operation. + # only need the mask, can just use the 1st one and expand later + selected_attn_weights = selected_attn_weights[0:1] + selected_attn_weights = (selected_attn_weights > 0.0).to( + selected_attn_weights.dtype) + selected_attn_weights = selected_attn_weights * ( + 1.0 / selected_attn_weights.sum(dim=-1, keepdim=True)) + + na = self.balancer_na( + self.nonlin_attention(src, selected_attn_weights)) + + src = src + ( + na if self_attn_dropout_mask is None else na + * self_attn_dropout_mask) + + self_attn = self.self_attn1(src, attn_weights) + + src = src + ( + self_attn if self_attn_dropout_mask is None else self_attn + * self_attn_dropout_mask) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + conv_skip_rate = 0.0 + else: + conv_skip_rate = float( + self.conv_skip_rate) if self.training else 0.0 + + src = src + self.sequence_dropout( + self.conv_module1( + src, + chunk_size=chunk_size, + src_key_padding_mask=src_key_padding_mask), + conv_skip_rate, + ) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + ff2_skip_rate = 0.0 + else: + ff2_skip_rate = float(self.ff2_skip_rate) if self.training else 0.0 + src = src + self.sequence_dropout( + self.balancer_ff2(self.feed_forward2(src)), ff2_skip_rate) + + # bypass in the middle of the layer. + src = self.bypass_mid(src_orig, src) + + self_attn = self.self_attn2(src, attn_weights) + + src = src + ( + self_attn if self_attn_dropout_mask is None else self_attn + * self_attn_dropout_mask) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + conv_skip_rate = 0.0 + else: + conv_skip_rate = float( + self.conv_skip_rate) if self.training else 0.0 + + src = src + self.sequence_dropout( + self.conv_module2( + src, + chunk_size=chunk_size, + src_key_padding_mask=src_key_padding_mask), + conv_skip_rate, + ) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + ff3_skip_rate = 0.0 + else: + ff3_skip_rate = float(self.ff3_skip_rate) if self.training else 0.0 + src = src + self.sequence_dropout( + self.balancer_ff3(self.feed_forward3(src)), ff3_skip_rate) + + src = self.balancer1(src) + src = self.norm(src) + + src = self.bypass(src_orig, src) + + src = self.balancer2(src) + src = self.whiten(src) + + return src + + +class BypassModule(nn.Module): + """ + An nn.Module that implements a learnable bypass scale, and also randomized per-sequence + layer-skipping. The bypass is limited during early stages of training to be close to + "straight-through", i.e. to not do the bypass operation much initially, in order to + force all the modules to learn something. 
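+
+    Example (an illustrative sketch; shapes follow the forward() contract below)::
+
+        >>> bypass = BypassModule(embed_dim=256)
+        >>> src_orig = torch.rand(10, 32, 256)
+        >>> src = torch.rand(10, 32, 256)
+        >>> out = bypass(src_orig, src)  # same shape as src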
+ """ + + def __init__( + self, + embed_dim: int, + skip_rate: FloatLike = 0.0, + straight_through_rate: FloatLike = 0.0, + scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), + default=0), + scale_max: FloatLike = 1.0, + ): + super().__init__() + self.bypass_scale = nn.Parameter(torch.full((embed_dim, ), 0.5)) + self.skip_rate = copy.deepcopy(skip_rate) + self.straight_through_rate = copy.deepcopy(straight_through_rate) + self.scale_min = copy.deepcopy(scale_min) + self.scale_max = copy.deepcopy(scale_max) + + def _get_bypass_scale(self, batch_size: int): + # returns bypass-scale of shape (num_channels,), + # or (batch_size, num_channels,). This is actually the + # scale on the non-residual term, so 0 corresponds to bypassing + # this module. + if torch.jit.is_scripting() or torch.jit.is_tracing( + ) or not self.training: + return self.bypass_scale + else: + ans = limit_param_value( + self.bypass_scale, + min=float(self.scale_min), + max=float(self.scale_max)) + skip_rate = float(self.skip_rate) + if skip_rate != 0.0: + mask = torch.rand( + (batch_size, 1), device=ans.device) > skip_rate + ans = ans * mask + # now ans is of shape (batch_size, num_channels), and is zero for sequences + # on which we have randomly chosen to do layer-skipping. + straight_through_rate = float(self.straight_through_rate) + if straight_through_rate != 0.0: + _rand_tensor = torch.rand((batch_size, 1), device=ans.device) + mask = (_rand_tensor < straight_through_rate) + ans = torch.maximum(ans, mask.to(ans.dtype)) + return ans + + def forward(self, src_orig: Tensor, src: Tensor): + """ + Args: src_orig and src are both of shape (seq_len, batch_size, num_channels) + Returns: something with the same shape as src and src_orig + """ + # bypass_scale = self._get_bypass_scale(src.shape[1]) + bypass_scale = self._get_bypass_scale(src.shape[-2]) + return src_orig + (src - src_orig) * bypass_scale + + +class SimpleDownsample(torch.nn.Module): + """ + Does downsampling with attention, by weighted sum, and a projection.. + """ + + def __init__(self, channels: int, downsample: int, dropout: FloatLike): + super(SimpleDownsample, self).__init__() + + self.bias = nn.Parameter(torch.zeros(downsample)) + + self.name = None # will be set from training code + self.dropout = copy.deepcopy(dropout) + + self.downsample = downsample + + def forward(self, src: Tensor) -> Tensor: + """ + x: (seq_len, batch_size, in_channels) + Returns a tensor of shape + ( (seq_len+downsample-1)//downsample, batch_size, channels) + """ + (seq_len, batch_size, in_channels) = src.shape + ds = self.downsample + d_seq_len = (seq_len + ds - 1) // ds + + # Pad to an exact multiple of self.downsample + # right-pad src, repeating the last element. + pad = d_seq_len * ds - seq_len + src_extra = src[src.shape[0] - 1:].expand(pad, src.shape[1], + src.shape[2]) + src = torch.cat((src, src_extra), dim=0) + assert src.shape[0] == d_seq_len * ds + + src = src.reshape(d_seq_len, ds, batch_size, in_channels) + + weights = self.bias.softmax(dim=0) + # weights: (downsample, 1, 1) + weights = weights.unsqueeze(-1).unsqueeze(-1) + + # ans1 is the first `in_channels` channels of the output + ans = (src * weights).sum(dim=1) + + return ans + + +class SimpleUpsample(torch.nn.Module): + """ + A very simple form of upsampling that mostly just repeats the input, but + also adds a position-specific bias. 
+ """ + + def __init__(self, num_channels: int, upsample: int): + super(SimpleUpsample, self).__init__() + self.upsample = upsample + + def forward(self, src: Tensor) -> Tensor: + """ + x: (seq_len, batch_size, num_channels) + Returns a tensor of shape + ( (seq_len*upsample), batch_size, num_channels) + """ + upsample = self.upsample + (seq_len, batch_size, num_channels) = src.shape + src = src.unsqueeze(1).expand(seq_len, upsample, batch_size, + num_channels) + src = src.reshape(seq_len * upsample, batch_size, num_channels) + return src + + +class CompactRelPositionalEncoding(torch.nn.Module): + """ + Relative positional encoding module. This version is "compact" meaning it is able to encode + the important information about the relative position in a relatively small number of dimensions. + The goal is to make it so that small differences between large relative offsets (e.g. 1000 vs. 1001) + make very little difference to the embedding. Such differences were potentially important + when encoding absolute position, but not important when encoding relative position because there + is now no need to compare two large offsets with each other. + + Our embedding works by projecting the interval [-infinity,infinity] to a finite interval + using the atan() function, before doing the Fourier transform of that fixed interval. The + atan() function would compress the "long tails" too small, + making it hard to distinguish between different magnitudes of large offsets, so we use a logarithmic + function to compress large offsets to a smaller range before applying atan(). + Scalings are chosen in such a way that the embedding can clearly distinguish individual offsets as long + as they are quite close to the origin, e.g. abs(offset) <= about sqrt(embedding_dim) + + + Args: + embed_dim: Embedding dimension. + dropout_rate: Dropout rate. + max_len: Maximum input length: just a heuristic for initialization. + length_factor: a heuristic scale (should be >= 1.0) which, if larger, gives + less weight to small differences of offset near the origin. + """ + + def __init__( + self, + embed_dim: int, + dropout_rate: FloatLike, + max_len: int = 1000, + length_factor: float = 1.0, + ) -> None: + """Construct a CompactRelPositionalEncoding object.""" + super(CompactRelPositionalEncoding, self).__init__() + self.embed_dim = embed_dim + assert embed_dim % 2 == 0, embed_dim + self.dropout = Dropout2(dropout_rate) + self.pe = None + assert length_factor >= 1.0, length_factor + self.length_factor = length_factor + self.extend_pe(torch.tensor(0.0).expand(max_len)) + + def extend_pe(self, x: Tensor, left_context_len: int = 0) -> None: + """Reset the positional encodings.""" + T = x.size(0) + left_context_len + + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(0) >= T * 2 - 1: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + + # if T == 4, x would contain [ -3, -2, 1, 0, 1, 2, 3 ] + x = torch.arange( + -(T - 1), T, device=x.device).to(torch.float32).unsqueeze(1) + + freqs = 1 + torch.arange(self.embed_dim // 2, device=x.device) + + # `compression_length` this is arbitrary/heuristic, if it is larger we have more resolution + # for small time offsets but less resolution for large time offsets. + compression_length = self.embed_dim**0.5 + # x_compressed, like X, goes from -infinity to infinity as T goes from -infinity to infinity; + # but it does so more slowly than T for large absolute values of T. 
+ # The formula is chosen so that d(x_compressed )/dx is 1 around x == 0, which + # is important. + _tmp_tensor = ((x.abs() + compression_length).log() + - math.log(compression_length)) + x_compressed = (compression_length * x.sign() * _tmp_tensor) + + # if self.length_factor == 1.0, then length_scale is chosen so that the + # FFT can exactly separate points close to the origin (T == 0). So this + # part of the formulation is not really heuristic. + # But empirically, for ASR at least, length_factor > 1.0 seems to work better. + length_scale = self.length_factor * self.embed_dim / (2.0 * math.pi) + + # note for machine implementations: if atan is not available, we can use: + # x.sign() * ((1 / (x.abs() + 1)) - 1) * (-math.pi/2) + # check on wolframalpha.com: plot(sign(x) * (1 / ( abs(x) + 1) - 1 ) * -pi/2 , atan(x)) + x_atan = (x_compressed + / length_scale).atan() # results between -pi and pi + + cosines = (x_atan * freqs).cos() + sines = (x_atan * freqs).sin() + + pe = torch.zeros(x.shape[0], self.embed_dim, device=x.device) + pe[:, 0::2] = cosines + pe[:, 1::2] = sines + pe[:, -1] = 1.0 # for bias. + + self.pe = pe.to(dtype=x.dtype) + + def forward(self, x: Tensor, left_context_len: int = 0) -> Tensor: + """Create positional encoding. + + Args: + x (Tensor): Input tensor (time, batch, `*`). + left_context_len: (int): Length of cached left context. + + Returns: + positional embedding, of shape (batch, left_context_len + 2*time-1, `*`). + """ + self.extend_pe(x, left_context_len) + x_size_left = x.size(0) + left_context_len + # length of positive side: x.size(0) + left_context_len + # length of negative side: x.size(0) + pos_emb = self.pe[self.pe.size(0) // 2 - x_size_left + + 1:self.pe.size(0) // 2 # noqa E203 + + x.size(0), :, ] + pos_emb = pos_emb.unsqueeze(0) + return self.dropout(pos_emb) + + +class RelPositionMultiheadAttentionWeights(nn.Module): + r"""Module that computes multi-head attention weights with relative position encoding. + Various other modules consume the resulting attention weights: see, for example, the + SimpleAttention module which allows you to compute conventional attention. + + This is a quite heavily modified from: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context", + we have to write up the differences. + + + Args: + embed_dim: number of channels at the input to this module, e.g. 256 + pos_dim: dimension of the positional encoding vectors, e.g. 128. + num_heads: number of heads to compute weights for, e.g. 8 + query_head_dim: dimension of the query (and key), per head. e.g. 24. + pos_head_dim: dimension of the projected positional encoding per head, e.g. 4. + dropout: dropout probability for attn_output_weights. Default: 0.0. + pos_emb_skip_rate: probability for skipping the pos_emb part of the scores on + any given call to forward(), in training time. + """ + + def __init__( + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + dropout: float = 0.0, + pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), + (4000.0, 0.0)), + ) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.query_head_dim = query_head_dim + self.pos_head_dim = pos_head_dim + self.dropout = dropout + self.pos_emb_skip_rate = copy.deepcopy(pos_emb_skip_rate) + self.name = None # will be overwritten in training code; for diagnostics. 
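+
+        # The single in_proj below packs the query, key and positional-query
+        # projections into one linear layer; forward() slices its output back
+        # into q, k and p.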
+ + key_head_dim = query_head_dim + in_proj_dim = (query_head_dim + key_head_dim + + pos_head_dim) * num_heads + + # the initial_scale is supposed to take over the "scaling" factor of + # head_dim ** -0.5 that has been used in previous forms of attention, + # dividing it between the query and key. Note: this module is intended + # to be used with the ScaledAdam optimizer; with most other optimizers, + # it would be necessary to apply the scaling factor in the forward function. + self.in_proj = ScaledLinear( + embed_dim, + in_proj_dim, + bias=True, + initial_scale=query_head_dim**-0.25) + + self.whiten_keys = Identity() + self.balance_keys = Identity() + + # linear transformation for positional encoding. + self.linear_pos = ScaledLinear( + pos_dim, num_heads * pos_head_dim, bias=False, initial_scale=0.05) + + # the following are for diagnostics only, see --print-diagnostics option + self.copy_pos_query = Identity() + self.copy_query = Identity() + + def forward( + self, + x: Tensor, + pos_emb: Tensor, + key_padding_mask: Optional[Tensor] = None, + attn_mask: Optional[Tensor] = None, + ) -> Tensor: + r""" + Args: + x: input of shape (seq_len, batch_size, embed_dim) + pos_emb: Positional embedding tensor, of shape (1, 2*seq_len - 1, pos_dim) + key_padding_mask: a bool tensor of shape (batch_size, seq_len). Positions that + are True in this mask will be ignored as sources in the attention weighting. + attn_mask: mask of shape (seq_len, seq_len) or (batch_size, seq_len, seq_len), + interpreted as ([batch_size,] tgt_seq_len, src_seq_len) + saying which positions are allowed to attend to which other positions. + Returns: + a tensor of attention weights, of shape (hum_heads, batch_size, seq_len, seq_len) + interpreted as (hum_heads, batch_size, tgt_seq_len, src_seq_len). + """ + x = self.in_proj(x) + query_head_dim = self.query_head_dim + pos_head_dim = self.pos_head_dim + num_heads = self.num_heads + + seq_len, batch_size, _ = x.shape + + query_dim = query_head_dim * num_heads + + # self-attention + q = x[..., 0:query_dim] + k = x[..., query_dim:2 * query_dim] + # p is the position-encoding query + p = x[..., 2 * query_dim:] + assert p.shape[-1] == num_heads * pos_head_dim, (p.shape[-1], + num_heads, + pos_head_dim) + + q = self.copy_query(q) # for diagnostics only, does nothing. + k = self.whiten_keys( + self.balance_keys(k)) # does nothing in the forward pass. + p = self.copy_pos_query(p) # for diagnostics only, does nothing. + + q = q.reshape(seq_len, batch_size, num_heads, query_head_dim) + p = p.reshape(seq_len, batch_size, num_heads, pos_head_dim) + k = k.reshape(seq_len, batch_size, num_heads, query_head_dim) + + # time1 refers to target, time2 refers to source. 
+ q = q.permute(2, 1, 0, 3) # (head, batch, time1, query_head_dim) + p = p.permute(2, 1, 0, 3) # (head, batch, time1, pos_head_dim) + k = k.permute(2, 1, 3, 0) # (head, batch, d_k, time2) + + # print(f"MHSAW {q.shape} {k.shape}") + attn_scores = torch.matmul(q, k) + + use_pos_scores = False + if torch.jit.is_scripting() or torch.jit.is_tracing(): + # We can't put random.random() in the same line + use_pos_scores = True + elif not self.training or random.random() >= float( + self.pos_emb_skip_rate): + use_pos_scores = True + + if use_pos_scores: + pos_emb = self.linear_pos(pos_emb) + seq_len2 = 2 * seq_len - 1 + pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, + pos_head_dim).permute(2, 0, 3, 1) + # pos shape now: (head, {1 or batch_size}, pos_dim, seq_len2) + + # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2) + # [where seq_len2 represents relative position.] + # print(f"MHSAW pos {p.shape} {pos_emb.shape}") + pos_scores = torch.matmul(p, pos_emb) + # the following .as_strided() expression converts the last axis of pos_scores from relative + # to absolute position. I don't know whether I might have got the time-offsets backwards or + # not, but let this code define which way round it is supposed to be. + if torch.jit.is_tracing(): + (num_heads, batch_size, time1, n) = pos_scores.shape + rows = torch.arange(start=time1 - 1, end=-1, step=-1) + cols = torch.arange(seq_len) + rows = rows.repeat(batch_size * num_heads).unsqueeze(-1) + indexes = rows + cols + pos_scores = pos_scores.reshape(-1, n) + pos_scores = torch.gather(pos_scores, dim=1, index=indexes) + pos_scores = pos_scores.reshape(num_heads, batch_size, time1, + seq_len) + else: + pos_scores = pos_scores.as_strided( + (num_heads, batch_size, seq_len, seq_len), + ( + pos_scores.stride(0), + pos_scores.stride(1), + pos_scores.stride(2) - pos_scores.stride(3), + pos_scores.stride(3), + ), + storage_offset=pos_scores.stride(3) * (seq_len - 1), + ) + # print(attn_scores.shape, pos_scores.shape) + if self.training: + attn_scores = attn_scores + pos_scores + else: + # inplace operator important + attn_scores.add_(pos_scores) + # attn_scores = attn_scores + pos_scores + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + pass + elif self.training and random.random() < 0.1: + # This is a harder way of limiting the attention scores to not be + # too large. It incurs a penalty if any of them has an absolute + # value greater than 50.0. this should be outside the normal range + # of the attention scores. We use this mechanism instead of, say, + # something added to the loss function involving the entropy, + # because once the entropy gets very small gradients through the + # softmax can become very small, and we'd get zero derivatives. The + # choices of 1.0e-04 as the scale on the penalty makes this + # mechanism vulnerable to the absolute scale of the loss function, + # but we view this as a failsafe to avoid "implausible" parameter + # values rather than a regularization method that should be active + # under normal circumstances. + attn_scores = penalize_abs_values_gt( + attn_scores, limit=25.0, penalty=1.0e-04, name=self.name) + + assert attn_scores.shape == (num_heads, batch_size, seq_len, seq_len) + + if attn_mask is not None: + assert attn_mask.dtype == torch.bool + # use -1000 to avoid nan's where attn_mask and key_padding_mask make + # all scores zero. 
It's important that this be large enough that exp(-1000) + # is exactly zero, for reasons related to const_attention_rate, it + # compares the final weights with zero. + attn_scores = attn_scores.masked_fill(attn_mask, -1000) + + if key_padding_mask is not None: + assert key_padding_mask.shape == ( + batch_size, + seq_len, + ), key_padding_mask.shape + attn_scores = attn_scores.masked_fill( + key_padding_mask.unsqueeze(1), + -1000, + ) + + # We use our own version of softmax, defined in scaling.py, which should + # save a little of the memory used in backprop by, if we are in + # automatic mixed precision mode (amp / autocast), by only storing the + # half-precision output for backprop purposes. + attn_weights = softmax(attn_scores, dim=-1) + + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training) + + return attn_weights + + +class SelfAttention(nn.Module): + """ + The simplest possible attention module. This one works with already-computed attention + weights, e.g. as computed by RelPositionMultiheadAttentionWeights. + + Args: + embed_dim: the input and output embedding dimension + num_heads: the number of attention heads + value_head_dim: the value dimension per head + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + value_head_dim: int, + ) -> None: + super().__init__() + self.in_proj = nn.Linear( + embed_dim, num_heads * value_head_dim, bias=True) + + self.out_proj = ScaledLinear( + num_heads * value_head_dim, + embed_dim, + bias=True, + initial_scale=0.05) + + self.whiten = Identity() + + def forward( + self, + x: Tensor, + attn_weights: Tensor, + ) -> Tensor: + """ + Args: + x: input tensor, of shape (seq_len, batch_size, embed_dim) + attn_weights: a tensor of shape (num_heads, batch_size, seq_len, seq_len), + with seq_len being interpreted as (tgt_seq_len, src_seq_len). Expect + attn_weights.sum(dim=-1) == 1. + Returns: + a tensor with the same shape as x. + """ + (seq_len, batch_size, embed_dim) = x.shape + num_heads = attn_weights.shape[0] + assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len) + + x = self.in_proj( + x) # (seq_len, batch_size, num_heads * value_head_dim) + x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3) + # now x: (num_heads, batch_size, seq_len, value_head_dim) + value_head_dim = x.shape[-1] + + # todo: see whether there is benefit in overriding matmul + # print(f"SelfAttetion pos {attn_weights.shape} {x.shape}") + x = torch.matmul(attn_weights, x) + # v: (num_heads, batch_size, seq_len, value_head_dim) + + x = ( + x.permute(2, 1, 0, + 3).contiguous().view(seq_len, batch_size, + num_heads * value_head_dim)) + + # returned value is of shape (seq_len, batch_size, embed_dim), like the input. 
+ x = self.out_proj(x) + x = self.whiten(x) + + return x + + +class FeedforwardModule(nn.Module): + """Feedforward module in Zipformer2 model.""" + + def __init__(self, embed_dim: int, feedforward_dim: int, + dropout: FloatLike): + super(FeedforwardModule, self).__init__() + self.in_proj = nn.Linear(embed_dim, feedforward_dim) + + self.hidden_balancer = Identity() + + # shared_dim=0 means we share the dropout mask along the time axis + self.out_proj = ActivationDropoutAndLinear( + feedforward_dim, + embed_dim, + activation='SwooshL', + dropout_p=dropout, + dropout_shared_dim=0, + bias=True, + initial_scale=0.1, + ) + + self.out_whiten = Identity() + + def forward(self, x: Tensor): + x = self.in_proj(x) + x = self.hidden_balancer(x) + # out_proj contains SwooshL activation, then dropout, then linear. + x = self.out_proj(x) + x = self.out_whiten(x) + return x + + +class NonlinAttention(nn.Module): + """This is like the ConvolutionModule, but refactored so that we use multiplication by attention weights (borrowed + from the attention module) in place of actual convolution. We also took out the second nonlinearity, the + one after the attention mechanism. + + Args: + channels (int): The number of channels of conv layers. + """ + + def __init__( + self, + channels: int, + hidden_channels: int, + ) -> None: + super().__init__() + + self.hidden_channels = hidden_channels + + self.in_proj = nn.Linear(channels, hidden_channels * 3, bias=True) + + self.balancer = Identity() + self.tanh = nn.Tanh() + + self.identity1 = Identity() # for diagnostics. + self.identity2 = Identity() # for diagnostics. + self.identity3 = Identity() # for diagnostics. + + self.out_proj = ScaledLinear( + hidden_channels, channels, bias=True, initial_scale=0.05) + + self.whiten1 = Identity() + self.whiten2 = Identity() + + def forward( + self, + x: Tensor, + attn_weights: Tensor, + ) -> Tensor: + """. + Args: + x: a Tensor of shape (seq_len, batch_size, num_channels) + attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len) + Returns: + a Tensor with the same shape as x + """ + x = self.in_proj(x) + + (seq_len, batch_size, _) = x.shape + hidden_channels = self.hidden_channels + + s, x, y = x.chunk(3, dim=2) + + # s will go through tanh. + + s = self.balancer(s) + s = self.tanh(s) + + s = s.unsqueeze(-1).reshape(seq_len, batch_size, hidden_channels) + x = self.whiten1(x) + x = x * s + x = self.identity1(x) # diagnostics only, it's the identity. + + (seq_len, batch_size, embed_dim) = x.shape + num_heads = attn_weights.shape[0] + assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len) + + x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3) + # now x: (num_heads, batch_size, seq_len, head_dim) + # print(f"nonlinattion {attn_weights.shape} {x.shape}") + x = torch.matmul(attn_weights, x) + # now x: (num_heads, batch_size, seq_len, head_dim) + x = x.permute(2, 1, 0, 3).reshape(seq_len, batch_size, -1) + + y = self.identity2(y) + x = x * y + x = self.identity3(x) + + x = self.out_proj(x) + x = self.whiten2(x) + return x + + +class ConvolutionModule(nn.Module): + """ConvolutionModule in Zipformer2 model. + Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/zipformer/convolution.py + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. + bias (bool): Whether to use bias in conv layers (default=True). 
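+
+    Example (an illustrative sketch; layout follows forward() below)::
+
+        >>> conv = ConvolutionModule(channels=256, kernel_size=31, causal=False)
+        >>> y = conv(torch.rand(100, 8, 256))  # (#time, batch, channels) -> same shape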
+ + """ + + def __init__( + self, + channels: int, + kernel_size: int, + causal: bool, + ) -> None: + """Construct a ConvolutionModule object.""" + super(ConvolutionModule, self).__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + bottleneck_dim = channels + self.causal = causal + + self.in_proj = nn.Linear( + channels, + 2 * bottleneck_dim, + ) + # the gradients on in_proj are a little noisy, likely to do with the + # sigmoid in glu. + + self.balancer1 = Identity() + + self.activation1 = Identity() # for diagnostics + + self.sigmoid = nn.Sigmoid() + + self.activation2 = Identity() # for diagnostics + + assert kernel_size % 2 == 1 + + self.depthwise_conv = ( + ChunkCausalDepthwiseConv1d( + channels=bottleneck_dim, kernel_size=kernel_size) + if causal else nn.Conv1d( + in_channels=bottleneck_dim, + out_channels=bottleneck_dim, + groups=bottleneck_dim, + kernel_size=kernel_size, + padding=kernel_size // 2, + )) + + self.balancer2 = Identity() + + self.whiten = Identity() + + self.out_proj = ActivationDropoutAndLinear( + bottleneck_dim, + channels, + activation='SwooshR', + dropout_p=0.0, + initial_scale=0.05, + ) + + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + chunk_size: int = -1, + ) -> Tensor: + """Compute convolution module. + + Args: + x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional): + (batch, #time), contains True in masked positions. + + Returns: + Tensor: Output tensor (#time, batch, channels). + + """ + + x = self.in_proj(x) # (time, batch, 2*channels) + + x, s = x.chunk(2, dim=2) + s = self.balancer1(s) + s = self.sigmoid(s) + x = self.activation1(x) # identity. + x = x * s + x = self.activation2(x) # identity + + # (time, batch, channels) + + # exchange the temporal dimension and the feature dimension + x = x.permute(1, 2, 0) # (#batch, channels, time). + + if src_key_padding_mask is not None: + x = x.masked_fill( + src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) + + if (not torch.jit.is_scripting() and not torch.jit.is_tracing() + and chunk_size >= 0): + # Not support exporting a model for simulated streaming decoding + assert ( + self.causal + ), 'Must initialize model with causal=True if you use chunk_size' + x = self.depthwise_conv(x, chunk_size=chunk_size) + else: + # with record_function("depthwise_conv"): + x = self.depthwise_conv(x) + # pass + + x = self.balancer2(x) + x = x.permute(2, 0, 1) # (time, batch, channels) + + x = self.whiten(x) # (time, batch, channels) + x = self.out_proj(x) # (time, batch, channels) + + return x + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py deleted file mode 100644 index 5e02076ee..000000000 --- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import os -from typing import Any, Dict - -from modelscope.metainfo import Models -from modelscope.models.base import Model -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Frameworks, Tasks - -__all__ = ['GenericAutomaticSpeechRecognition'] - - -@MODELS.register_module( - Tasks.auto_speech_recognition, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.voice_activity_detection, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.speech_separation, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.language_score_prediction, module_name=Models.generic_asr) -@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.generic_asr) -class GenericAutomaticSpeechRecognition(Model): - - def __init__(self, model_dir: str, am_model_name: str, - model_config: Dict[str, Any], *args, **kwargs): - """initialize the info of model. - - Args: - model_dir (str): the model path. - am_model_name (str): the am model name from configuration.json - model_config (Dict[str, Any]): the detail config about model from configuration.json - """ - super().__init__(model_dir, am_model_name, model_config, *args, - **kwargs) - self.model_cfg = { - # the recognition model dir path - 'model_workspace': model_dir, - # the am model name - 'am_model': am_model_name, - # the am model file path - 'am_model_path': os.path.join(model_dir, am_model_name), - # the recognition model config dict - 'model_config': model_config - } - - def forward(self) -> Dict[str, Any]: - """preload model and return the info of the model - """ - - return self.model_cfg diff --git a/tests/metrics/__init__.py b/modelscope/models/audio/funasr/__init__.py similarity index 100% rename from tests/metrics/__init__.py rename to modelscope/models/audio/funasr/__init__.py diff --git a/modelscope/models/audio/funasr/model.py b/modelscope/models/audio/funasr/model.py new file mode 100644 index 000000000..73ffc6189 --- /dev/null +++ b/modelscope/models/audio/funasr/model.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +import json +from funasr import AutoModel + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Frameworks, Tasks + +__all__ = ['GenericFunASR'] + + +@MODELS.register_module( + Tasks.auto_speech_recognition, module_name=Models.funasr) +@MODELS.register_module( + Tasks.voice_activity_detection, module_name=Models.funasr) +@MODELS.register_module( + Tasks.language_score_prediction, module_name=Models.funasr) +@MODELS.register_module(Tasks.punctuation, module_name=Models.funasr) +@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.funasr) +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.funasr) +@MODELS.register_module(Tasks.speech_separation, module_name=Models.funasr) +@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.funasr) +@MODELS.register_module(Tasks.emotion_recognition, module_name=Models.funasr) +class GenericFunASR(Model): + + def __init__(self, model_dir, *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. 
+ am_model_name (str): the am model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, *args, **kwargs) + model_cfg = json.loads( + open(os.path.join(model_dir, 'configuration.json')).read()) + if 'vad_model' not in kwargs and 'vad_model' in model_cfg: + kwargs['vad_model'] = model_cfg['vad_model'] + kwargs['vad_model_revision'] = model_cfg.get( + 'vad_model_revision', None) + if 'punc_model' not in kwargs and 'punc_model' in model_cfg: + kwargs['punc_model'] = model_cfg['punc_model'] + kwargs['punc_model_revision'] = model_cfg.get( + 'punc_model_revision', None) + if 'spk_model' not in kwargs and 'spk_model' in model_cfg: + kwargs['spk_model'] = model_cfg['spk_model'] + kwargs['spk_model_revision'] = model_cfg.get( + 'spk_model_revision', None) + + self.model = AutoModel(model=model_dir, **kwargs) + + def forward(self, *args, **kwargs): + """preload model and return the info of the model + """ + + output = self.model.generate(*args, **kwargs) + return output diff --git a/modelscope/models/audio/quantization/__init__.py b/modelscope/models/audio/quantization/__init__.py new file mode 100644 index 000000000..4952a0765 --- /dev/null +++ b/modelscope/models/audio/quantization/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .generic_audio_quantization import GenericAudioQuantization + +else: + _import_structure = { + 'generic_audio_quantization': ['GenericAudioQuantization'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/audio/sv/generic_speaker_verification.py b/modelscope/models/audio/quantization/generic_audio_quantization.py similarity index 87% rename from modelscope/models/audio/sv/generic_speaker_verification.py rename to modelscope/models/audio/quantization/generic_audio_quantization.py index 788ccf7c7..2967cd3c2 100644 --- a/modelscope/models/audio/sv/generic_speaker_verification.py +++ b/modelscope/models/audio/quantization/generic_audio_quantization.py @@ -8,12 +8,12 @@ from modelscope.models.builder import MODELS from modelscope.utils.constant import Frameworks, Tasks +__all__ = ['GenericAudioQuantization'] + @MODELS.register_module( - Tasks.speaker_verification, module_name=Models.generic_sv) -@MODELS.register_module( - Tasks.speaker_diarization, module_name=Models.generic_sv) -class SpeakerVerification(Model): + Tasks.audio_quantization, module_name=Models.audio_quantization) +class GenericAudioQuantization(Model): def __init__(self, model_dir: str, model_name: str, model_config: Dict[str, Any], *args, **kwargs): diff --git a/modelscope/models/audio/sv/ERes2Net.py b/modelscope/models/audio/sv/ERes2Net.py index 0119783c3..0d4a81374 100644 --- a/modelscope/models/audio/sv/ERes2Net.py +++ b/modelscope/models/audio/sv/ERes2Net.py @@ -19,6 +19,7 @@ from modelscope.models import MODELS, TorchModel from modelscope.models.audio.sv.fusion import AFF from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device class ReLU(nn.Hardtanh): @@ -54,11 +55,11 @@ def conv3x3(in_planes, out_planes, stride=1): bias=False) -class BasicBlockRes2Net(nn.Module): +class BasicBlockERes2Net(nn.Module): expansion = 2 def __init__(self, in_planes, planes, stride=1, 
baseWidth=32, scale=2): - super(BasicBlockRes2Net, self).__init__() + super(BasicBlockERes2Net, self).__init__() width = int(math.floor(planes * (baseWidth / 64.0))) self.conv1 = conv1x1(in_planes, width * scale, stride) self.bn1 = nn.BatchNorm2d(width * scale) @@ -117,11 +118,11 @@ def forward(self, x): return out -class BasicBlockRes2Net_diff_AFF(nn.Module): +class BasicBlockERes2Net_AFF(nn.Module): expansion = 2 def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): - super(BasicBlockRes2Net_diff_AFF, self).__init__() + super(BasicBlockERes2Net_AFF, self).__init__() width = int(math.floor(planes * (baseWidth / 64.0))) self.conv1 = conv1x1(in_planes, width * scale, stride) self.bn1 = nn.BatchNorm2d(width * scale) @@ -189,8 +190,8 @@ def forward(self, x): class ERes2Net(nn.Module): def __init__(self, - block=BasicBlockRes2Net, - block_fuse=BasicBlockRes2Net_diff_AFF, + block=BasicBlockERes2Net, + block_fuse=BasicBlockERes2Net_AFF, num_blocks=[3, 4, 6, 3], m_channels=32, feat_dim=80, @@ -314,6 +315,7 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, self.m_channels = self.model_config['channels'] self.other_config = kwargs self.feature_dim = 80 + self.device = create_device(self.other_config['device']) self.embedding_model = ERes2Net( embed_dim=self.embed_dim, m_channels=self.m_channels) @@ -321,6 +323,7 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, pretrained_model_name = kwargs['pretrained_model'] self.__load_check_point(pretrained_model_name) + self.embedding_model.to(self.device) self.embedding_model.eval() def forward(self, audio): @@ -333,7 +336,7 @@ def forward(self, audio): ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' # audio shape: [N, T] feature = self.__extract_feature(audio) - embedding = self.embedding_model(feature) + embedding = self.embedding_model(feature.to(self.device)) return embedding.detach().cpu() diff --git a/modelscope/models/audio/sv/ERes2NetV2.py b/modelscope/models/audio/sv/ERes2NetV2.py new file mode 100644 index 000000000..d842a0948 --- /dev/null +++ b/modelscope/models/audio/sv/ERes2NetV2.py @@ -0,0 +1,345 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" + To further improve the short-duration feature extraction capability of ERes2Net, + we expand the channel dimension within each stage. However, this modification also + increases the number of model parameters and computational complexity. + To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, + ultimately reducing both the model parameters and its computational cost. 
+""" + +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.models.audio.sv.fusion import AFF +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class ReLU(nn.Hardtanh): + + def __init__(self, inplace=False): + super(ReLU, self).__init__(0, 20, inplace) + + def __repr__(self): + inplace_str = 'inplace' if self.inplace else '' + return self.__class__.__name__ + ' (' \ + + inplace_str + ')' + + +class BasicBlockERes2NetV2(nn.Module): + + def __init__(self, + in_planes, + planes, + stride=1, + baseWidth=26, + scale=2, + expansion=2): + super(BasicBlockERes2NetV2, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.width = width + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale + self.expansion = expansion + + convs = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class BasicBlockERes2NetV2AFF(nn.Module): + + def __init__(self, + in_planes, + planes, + stride=1, + baseWidth=26, + scale=2, + expansion=2): + super(BasicBlockERes2NetV2AFF, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.width = width + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale + self.expansion = expansion + + convs = [] + fuse_models = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + for j in range(self.nums - 1): + fuse_models.append(AFF(channels=width, r=4)) + + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.fuse_models = nn.ModuleList(fuse_models) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = 
nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = self.fuse_models[i - 1](sp, spx[i]) + + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class ERes2NetV2(nn.Module): + + def __init__(self, + block=BasicBlockERes2NetV2, + block_fuse=BasicBlockERes2NetV2AFF, + num_blocks=[3, 4, 6, 3], + m_channels=64, + feat_dim=80, + embed_dim=192, + baseWidth=26, + scale=2, + expansion=2, + pooling_func='TSTP', + two_emb_layer=False): + super(ERes2NetV2, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embed_dim = embed_dim + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + self.two_emb_layer = two_emb_layer + self.baseWidth = baseWidth + self.scale = scale + self.expansion = expansion + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block_fuse, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block_fuse, m_channels * 8, num_blocks[3], stride=2) + + # Downsampling module + self.layer3_ds = nn.Conv2d( + m_channels * 4 * self.expansion, + m_channels * 8 * self.expansion, + kernel_size=3, + padding=1, + stride=2, + bias=False) + + # Bottom-up fusion module + self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * self.expansion) + self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, + embed_dim) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False) + self.seg_2 = nn.Linear(embed_dim, embed_dim) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append( + block( + self.in_planes, + planes, + stride, + baseWidth=self.baseWidth, + scale=self.scale, + expansion=self.expansion)) + self.in_planes = planes * self.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out1 = self.layer1(out) + out2 = self.layer2(out1) + out3 = self.layer3(out2) + out4 = self.layer4(out3) + out3_ds = self.layer3_ds(out3) + fuse_out34 = self.fuse34(out4, out3_ds) + stats = self.pool(fuse_out34) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.eres2netv2_sv) +class SpeakerVerificationERes2NetV2(TorchModel): + r"""ERes2NetV2 architecture with local and 
global feature fusion. ERes2NetV2 is mainly composed + of Bottom-up Dual-stage Feature Fusion (BDFF) and Bottleneck-like Local Feature Fusion (BLFF). + BDFF fuses multi-scale feature maps in bottom-up pathway to obtain global information. + The BLFF extracts localization-preserved speaker features and strengthen the local information interaction. + Args: + model_dir: A model dir. + model_config: The model config. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.baseWidth = self.model_config['baseWidth'] + self.scale = self.model_config['scale'] + self.expansion = self.model_config['expansion'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = ERes2NetV2( + embed_dim=self.embed_dim, + baseWidth=self.baseWidth, + scale=self.scale, + expansion=self.expansion) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/ERes2Net_aug.py b/modelscope/models/audio/sv/ERes2Net_aug.py index d0739cad2..5540ff3ef 100644 --- a/modelscope/models/audio/sv/ERes2Net_aug.py +++ b/modelscope/models/audio/sv/ERes2Net_aug.py @@ -19,6 +19,7 @@ from modelscope.models import MODELS, TorchModel from modelscope.models.audio.sv.fusion import AFF from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device class ReLU(nn.Hardtanh): @@ -308,12 +309,13 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, self.model_config = model_config self.other_config = kwargs self.feature_dim = 80 - + self.device = create_device(self.other_config['device']) self.embedding_model = ERes2Net_aug() pretrained_model_name = kwargs['pretrained_model'] self.__load_check_point(pretrained_model_name) + self.embedding_model.to(self.device) self.embedding_model.eval() def forward(self, audio): @@ -326,7 +328,7 @@ def forward(self, audio): ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' # audio shape: [N, T] feature = self.__extract_feature(audio) - embedding = self.embedding_model(feature) + embedding = self.embedding_model(feature.to(self.device)) return embedding.detach().cpu() diff --git a/modelscope/models/audio/sv/Res2Net.py b/modelscope/models/audio/sv/Res2Net.py new file mode 100644 index 000000000..0d26e6014 --- /dev/null +++ 
b/modelscope/models/audio/sv/Res2Net.py @@ -0,0 +1,234 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" Res2Net implementation is adapted from https://github.com/Res2Net/Res2Net-PretrainedModels. + Res2Net is an advanced neural network architecture that enhances the capabilities of standard ResNets + by incorporating hierarchical residual-like connections. This innovative structure improves + performance across various computer vision tasks, such as image classification and object + detection, without significant computational overhead. + Reference: https://arxiv.org/pdf/1904.01169.pdf + Some modifications from the original architecture: + 1. Smaller kernel size for the input layer + 2. Smaller expansion in BasicBlockRes2Net +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class ReLU(nn.Hardtanh): + + def __init__(self, inplace=False): + super(ReLU, self).__init__(0, 20, inplace) + + def __repr__(self): + inplace_str = 'inplace' if self.inplace else '' + return self.__class__.__name__ + ' (' \ + + inplace_str + ')' + + +class BasicBlockRes2Net(nn.Module): + expansion = 2 + + def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): + super(BasicBlockRes2Net, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale - 1 + convs = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + self.stride = stride + self.width = width + self.scale = scale + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = torch.cat((out, spx[self.nums]), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class Res2Net(nn.Module): + + def __init__(self, + block=BasicBlockRes2Net, + num_blocks=[3, 4, 6, 3], + m_channels=32, + feat_dim=80, + embedding_size=192, + pooling_func='TSTP', + two_emb_layer=False): + super(Res2Net, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embedding_size = embedding_size + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + 
self.two_emb_layer = two_emb_layer + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block, m_channels * 8, num_blocks[3], stride=2) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * block.expansion) + self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, + embedding_size) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) + self.seg_2 = nn.Linear(embedding_size, embedding_size) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + + stats = self.pool(out) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.res2net_sv) +class SpeakerVerificationResNet(TorchModel): + r""" + Args: + model_dir: A model dir. + model_config: The model config. 
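The pooling_layers module imported by these backbones is not part of this diff. As a rough stand-in, a TSTP ("temporal statistics pooling") head flattens the (N, C, F, T) feature map over channels and frequency, then concatenates its mean and standard deviation over time, which is why n_stats is 2 for TSTP and 1 for TAP/TSDP above. The class below is an assumption-based sketch, not the real implementation:

import torch
import torch.nn as nn


class TSTP(nn.Module):

    def __init__(self, in_dim):
        super().__init__()
        self.in_dim = in_dim

    def forward(self, x):
        # x: (N, C, F, T) feature map from the last conv stage
        x = x.reshape(x.shape[0], -1, x.shape[-1])   # (N, C*F, T)
        mean = x.mean(dim=-1)
        std = x.std(dim=-1)
        return torch.cat([mean, std], dim=-1)        # (N, 2 * C * F)


if __name__ == '__main__':
    print(TSTP(in_dim=2560)(torch.randn(4, 256, 10, 50)).shape)  # (4, 5120) = 2 * in_dim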
+ """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.m_channels = self.model_config['channels'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = Res2Net( + embedding_size=self.embed_dim, m_channels=self.m_channels) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/ResNet.py b/modelscope/models/audio/sv/ResNet.py new file mode 100644 index 000000000..94d303b56 --- /dev/null +++ b/modelscope/models/audio/sv/ResNet.py @@ -0,0 +1,186 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" ResNet implementation is adapted from https://github.com/wenet-e2e/wespeaker. + ResNet, or Residual Neural Network, is notable for its optimization ease + and depth-induced accuracy gains. It utilizes skip connections within its residual + blocks to counteract the vanishing gradient problem in deep networks. + Reference: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + + def __init__(self, + block=BasicBlock, + num_blocks=[3, 4, 6, 3], + m_channels=32, + feat_dim=80, + embedding_size=128, + pooling_func='TSTP', + two_emb_layer=True): + super(ResNet, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embedding_size = embedding_size + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + self.two_emb_layer = two_emb_layer + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block, m_channels * 8, num_blocks[3], stride=2) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * block.expansion) + self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, + embedding_size) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) + self.seg_2 = nn.Linear(embedding_size, embedding_size) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out1 = self.layer1(out) + out2 = self.layer2(out1) + out3 = self.layer3(out2) + out = self.layer4(out3) + stats = self.pool(out) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.resnet_sv) +class SpeakerVerificationResNet(TorchModel): + r""" + 
Args: + model_dir: A model dir. + model_config: The model config. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.m_channels = self.model_config['channels'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = ResNet( + embedding_size=self.embed_dim, m_channels=self.m_channels) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/lanuage_recognition_eres2net.py b/modelscope/models/audio/sv/lanuage_recognition_eres2net.py index 0876cd2e5..927d9b00f 100644 --- a/modelscope/models/audio/sv/lanuage_recognition_eres2net.py +++ b/modelscope/models/audio/sv/lanuage_recognition_eres2net.py @@ -92,9 +92,9 @@ def forward(self, audio): # audio shape: [N, T] feature = self._extract_feature(audio) embs = self.encoder(feature.to(self.device)) - output = self.backend(embs) - output = output.detach().cpu().argmax(-1) - return output + scores = self.backend(embs).detach() + output = scores.cpu().argmax(-1) + return scores, output def _extract_feature(self, audio): features = [] diff --git a/modelscope/models/audio/sv/lanuage_recognition_model.py b/modelscope/models/audio/sv/lanuage_recognition_model.py index 3ab531282..1f7da7605 100644 --- a/modelscope/models/audio/sv/lanuage_recognition_model.py +++ b/modelscope/models/audio/sv/lanuage_recognition_model.py @@ -89,9 +89,9 @@ def forward(self, audio): # audio shape: [N, T] feature = self._extract_feature(audio) embs = self.encoder(feature.to(self.device)) - output = self.backend(embs) - output = output.detach().cpu().argmax(-1) - return output + scores = self.backend(embs).detach() + output = scores.cpu().argmax(-1) + return scores, output def _extract_feature(self, audio): features = [] diff --git a/modelscope/models/audio/sv/sdpn.py b/modelscope/models/audio/sv/sdpn.py new file mode 100644 index 000000000..2c279e9d7 --- /dev/null +++ b/modelscope/models/audio/sv/sdpn.py @@ -0,0 +1,614 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain. + Self-Distillation Prototypes Network (SDPN) is a self-supervised learning framework in SV. + It comprises a teacher and a student network with identical architecture + but different parameters.
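This file only loads a pretrained teacher for inference; during self-supervised training the teacher in such a setup is typically not optimised by gradients but tracks the student with an exponential moving average. A sketch of that update under those assumptions (function name and momentum value are illustrative):

import torch


@torch.no_grad()
def ema_update(teacher: torch.nn.Module, student: torch.nn.Module, momentum: float = 0.996):
    # teacher parameters drift slowly towards the student's current parameters
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        p_t.data.mul_(momentum).add_(p_s.data, alpha=1.0 - momentum)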
Teacher/student network consists of three main modules: + the encoder for extracting speaker embeddings, multi-layer perceptron for + feature transformation, and prototypes for computing soft-distributions between + global and local views. EMA denotes Exponential Moving Average. +""" +import math +import os +from typing import Any, Dict, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks + + +def length_to_mask(length, max_len=None, dtype=None, device=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().long().item() + mask = torch.arange( + max_len, device=length.device, dtype=length.dtype).expand( + len(length), max_len) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + if device is None: + device = length.device + + mask = torch.as_tensor(mask, dtype=dtype, device=device) + return mask + + +def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int): + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + return padding + + +class Conv1d(nn.Module): + + def __init__( + self, + out_channels, + kernel_size, + in_channels, + stride=1, + dilation=1, + padding='same', + groups=1, + bias=True, + padding_mode='reflect', + ): + super().__init__() + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1d( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + dilation=self.dilation, + padding=0, + groups=groups, + bias=bias, + ) + + def forward(self, x): + if self.padding == 'same': + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + + elif self.padding == 'causal': + num_pad = (self.kernel_size - 1) * self.dilation + x = F.pad(x, (num_pad, 0)) + + elif self.padding == 'valid': + pass + + else: + raise ValueError( + "Padding must be 'same', 'valid' or 'causal'. 
Got " + + self.padding) + + wx = self.conv(x) + + return wx + + def _manage_padding( + self, + x, + kernel_size: int, + dilation: int, + stride: int, + ): + L_in = x.shape[-1] + padding = get_padding_elem(L_in, stride, kernel_size, dilation) + x = F.pad(x, padding, mode=self.padding_mode) + + return x + + +class BatchNorm1d(nn.Module): + + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.1, + ): + super().__init__() + self.norm = nn.BatchNorm1d( + input_size, + eps=eps, + momentum=momentum, + ) + + def forward(self, x): + return self.norm(x) + + +class TDNNBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, + groups=1, + ): + super(TDNNBlock, self).__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + groups=groups, + ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(torch.nn.Module): + + def __init__(self, + in_channels, + out_channels, + scale=8, + kernel_size=3, + dilation=1): + super(Res2NetBlock, self).__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.ModuleList([ + TDNNBlock( + in_channel, + hidden_channel, + kernel_size=kernel_size, + dilation=dilation, + ) for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = torch.cat(y, dim=1) + return y + + +class SEBlock(nn.Module): + + def __init__(self, in_channels, se_channels, out_channels): + super(SEBlock, self).__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = torch.nn.ReLU(inplace=True) + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L, device=x.device) + mask = mask.unsqueeze(1) + total = mask.sum(dim=2, keepdim=True) + s = (x * mask).sum(dim=2, keepdim=True) / total + else: + s = x.mean(dim=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Module): + + def __init__(self, channels, attention_channels=128, global_context=True): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + L = x.shape[-1] + + def _compute_statistics(x, m, dim=2, eps=self.eps): + mean = (m * x).sum(dim) + std = torch.sqrt( + (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)) + return mean, std + + if lengths is None: + lengths = torch.ones(x.shape[0], device=x.device) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L, device=x.device) + mask = 
mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. + if self.global_context: + # torch.std is unstable for backward computation + # https://github.com/pytorch/pytorch/issues/4320 + total = mask.sum(dim=2, keepdim=True).float() + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).repeat(1, 1, L) + std = std.unsqueeze(2).repeat(1, 1, L) + attn = torch.cat([x, mean, std], dim=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = attn.masked_fill(mask == 0, float('-inf')) + + attn = F.softmax(attn, dim=2) + mean, std = _compute_statistics(x, attn) + # Append mean and std of the batch + pooled_stats = torch.cat((mean, std), dim=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=torch.nn.ReLU, + groups=1, + ): + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + groups=groups, + ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, kernel_size, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + groups=groups, + ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class ECAPA_TDNN(nn.Module): + """An implementation of the speaker embedding model in a paper. + "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in + TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143). 
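The AttentiveStatisticsPooling module above reduces a (batch, channels, time) map to an attention-weighted mean plus an attention-weighted standard deviation, with the weights produced by the small TDNN attention network. A stripped-down sketch with random stand-in attention weights (no masking, shapes illustrative):

import torch

x = torch.randn(2, 1536, 100)                            # (batch, channels, time)
attn = torch.softmax(torch.randn(2, 1536, 100), dim=2)   # stand-in for the attention net output
mean = (attn * x).sum(dim=2)
std = torch.sqrt((attn * (x - mean.unsqueeze(2)).pow(2)).sum(dim=2).clamp(1e-12))
pooled = torch.cat((mean, std), dim=1)                    # (2, 3072), twice the channel dimension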
+ """ + + def __init__( + self, + input_size, + device='cpu', + lin_neurons=512, + activation=torch.nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, + groups=[1, 1, 1, 1, 1], + ): + + super().__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.ModuleList() + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, + groups[0], + )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, + groups=groups[i], + )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, + groups=groups[-1], + ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, + ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=lin_neurons, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + """Returns the embedding vector. + + Arguments + --------- + x : torch.Tensor + Tensor of shape (batch, time, channel). + """ + x = x.transpose(1, 2) + + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = torch.cat(xl[1:], dim=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, lengths=lengths) + x = self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + x = x.transpose(1, 2).squeeze(1) + return x + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_.' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l_ = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l_, u], then translate to + # [2l-1, 2u-1]. 
+ tensor.uniform_(2 * l_ - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class SDPNHead(nn.Module): + + def __init__(self, + in_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256): + super().__init__() + nlayers = max(nlayers, 1) + if nlayers == 1: + self.mlp = nn.Linear(in_dim, bottleneck_dim) + else: + layers = [nn.Linear(in_dim, hidden_dim)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim)) + self.mlp = nn.Sequential(*layers) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + x = nn.functional.normalize(x, dim=-1, p=2) + return x + + +class Combiner(torch.nn.Module): + """ + Combine backbone (ECAPA) and head (MLP) + """ + + def __init__(self, backbone, head): + super(Combiner, self).__init__() + self.backbone = backbone + self.head = head + + def forward(self, x): + x = self.backbone(x) + output = self.head(x) + return x, output + + +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.sdpn_sv) +class SpeakerVerificationSDPN(TorchModel): + """ + Self-Distillation Prototypes Network (SDPN) effectively facilitates + self-supervised speaker representation learning. The specific structure can be + referred to in https://arxiv.org/pdf/2308.02774. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.other_config = kwargs + if self.model_config['channel'] != 1024: + raise ValueError( + 'modelscope error: Currently only 1024-channel ecapa tdnn is supported.' 
+ ) + + self.feature_dim = 80 + channels_config = [1024, 1024, 1024, 1024, 3072] + + self.embedding_model = ECAPA_TDNN( + self.feature_dim, channels=channels_config) + self.embedding_model = Combiner(self.embedding_model, + SDPNHead(512, True)) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.eval() + + def forward(self, audio): + assert len(audio.shape) == 2 and audio.shape[ + 0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]' + # audio shape: [1, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model.backbone(feature) + + return embedding + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + state_dict = torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device) + state_dict_tea = { + k.replace('module.', ''): v + for k, v in state_dict['teacher'].items() + } + self.embedding_model.load_state_dict(state_dict_tea, strict=True) diff --git a/modelscope/models/audio/sv/xvector.py b/modelscope/models/audio/sv/xvector.py new file mode 100644 index 000000000..4a4c15a4a --- /dev/null +++ b/modelscope/models/audio/sv/xvector.py @@ -0,0 +1,153 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" + This TDNN implementation is adapted from https://github.com/wenet-e2e/wespeaker. + TDNN replaces i-vectors for text-independent speaker verification with embeddings + extracted from a feedforward deep neural network. The specific structure can be + referred to in https://www.danielpovey.com/files/2017_interspeech_embeddings.pdf. +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class TdnnLayer(nn.Module): + + def __init__(self, in_dim, out_dim, context_size, dilation=1, padding=0): + """Define the TDNN layer, essentially 1-D convolution + + Args: + in_dim (int): input dimension + out_dim (int): output channels + context_size (int): context size, essentially the filter size + dilation (int, optional): Defaults to 1. + padding (int, optional): Defaults to 0. 
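A TdnnLayer is a dilated 1-D convolution over frames, so the temporal receptive field of a stack can be worked out by hand; for the five frame-level layers of the XVEC model defined below this comes to 15 frames. The helper is illustrative only:

def receptive_field(layers):
    # each (context_size, dilation) conv grows the field by (context - 1) * dilation frames
    rf = 1
    for context, dilation in layers:
        rf += (context - 1) * dilation
    return rf


# frame-level layers below, as (context_size, dilation) pairs
print(receptive_field([(5, 1), (3, 2), (3, 3), (1, 1), (1, 1)]))  # 15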
+ """ + super(TdnnLayer, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.context_size = context_size + self.dilation = dilation + self.padding = padding + self.conv_1d = nn.Conv1d( + self.in_dim, + self.out_dim, + self.context_size, + dilation=self.dilation, + padding=self.padding) + + # Set Affine=false to be compatible with the original kaldi version + self.bn = nn.BatchNorm1d(out_dim, affine=False) + + def forward(self, x): + out = self.conv_1d(x) + out = F.relu(out) + out = self.bn(out) + return out + + +class XVEC(nn.Module): + + def __init__(self, + feat_dim=40, + hid_dim=512, + stats_dim=1500, + embed_dim=512, + pooling_func='TSTP'): + """ + Implementation of Kaldi style xvec, as described in + X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION + """ + super(XVEC, self).__init__() + self.feat_dim = feat_dim + self.stats_dim = stats_dim + self.embed_dim = embed_dim + + self.frame_1 = TdnnLayer(feat_dim, hid_dim, context_size=5, dilation=1) + self.frame_2 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=2) + self.frame_3 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=3) + self.frame_4 = TdnnLayer(hid_dim, hid_dim, context_size=1, dilation=1) + self.frame_5 = TdnnLayer( + hid_dim, stats_dim, context_size=1, dilation=1) + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim) + self.seg_1 = nn.Linear(self.stats_dim * self.n_stats, embed_dim) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T) + + out = self.frame_1(x) + out = self.frame_2(out) + out = self.frame_3(out) + out = self.frame_4(out) + out = self.frame_5(out) + + stats = self.pool(out) + embed_a = self.seg_1(stats) + return embed_a + + +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.tdnn_sv) +class SpeakerVerificationTDNN(TorchModel): + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.other_config = kwargs + + self.feature_dim = 80 + self.embed_dim = 512 + self.device = create_device(self.other_config['device']) + print(self.device) + + self.embedding_model = XVEC( + feat_dim=self.feature_dim, embed_dim=self.embed_dim) + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + features = [] + for au in audio: + feature = Kaldi.fbank( + au.unsqueeze(0), num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + features.append(feature.unsqueeze(0)) + features = torch.cat(features) + return features + + def __load_check_point(self, pretrained_model_name): + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=torch.device('cpu')), + strict=True) diff --git a/modelscope/models/audio/tts/__init__.py b/modelscope/models/audio/tts/__init__.py 
index 8af35c5a3..38420985d 100644 --- a/modelscope/models/audio/tts/__init__.py +++ b/modelscope/models/audio/tts/__init__.py @@ -5,9 +5,13 @@ if TYPE_CHECKING: from .sambert_hifi import SambertHifigan + from .laura_codec import LauraCodecGenModel else: - _import_structure = {'sambert_hifi': ['SambertHifigan']} + _import_structure = { + 'sambert_hifi': ['SambertHifigan'], + 'laura_codec': ['LauraCodecGenModel'], + } import sys sys.modules[__name__] = LazyImportModule( __name__, diff --git a/modelscope/models/audio/punc/generic_punctuation.py b/modelscope/models/audio/tts/laura_codec.py similarity index 52% rename from modelscope/models/audio/punc/generic_punctuation.py rename to modelscope/models/audio/tts/laura_codec.py index dabb60905..0e50321ce 100644 --- a/modelscope/models/audio/punc/generic_punctuation.py +++ b/modelscope/models/audio/tts/laura_codec.py @@ -8,30 +8,31 @@ from modelscope.models.builder import MODELS from modelscope.utils.constant import Frameworks, Tasks +__all__ = ['LauraCodecGenModel'] -@MODELS.register_module(Tasks.punctuation, module_name=Models.generic_punc) -class PunctuationProcessing(Model): - def __init__(self, model_dir: str, punc_model_name: str, - punc_model_config: Dict[str, Any], *args, **kwargs): +@MODELS.register_module(Tasks.text_to_speech, module_name=Models.laura_codec) +class LauraCodecGenModel(Model): + + def __init__(self, model_dir: str, model_name: str, + model_config: Dict[str, Any], *args, **kwargs): """initialize the info of model. Args: model_dir (str): the model path. - punc_model_name (str): the itn model name from configuration.json - punc_model_config (Dict[str, Any]): the detail config about model from configuration.json + model_name (str): the model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json """ - super().__init__(model_dir, punc_model_name, punc_model_config, *args, - **kwargs) + super().__init__(model_dir, model_name, model_config, *args, **kwargs) self.model_cfg = { # the recognition model dir path 'model_workspace': model_dir, # the itn model name - 'punc_model': punc_model_name, + 'model_name': model_name, # the am model file path - 'punc_model_path': os.path.join(model_dir, punc_model_name), + 'model_path': os.path.join(model_dir, model_name), # the recognition model config dict - 'model_config': punc_model_config + 'model_config': model_config } def forward(self) -> Dict[str, Any]: diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index b57fba53e..f2bba487c 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -13,7 +13,7 @@ BACKBONES = MODELS HEADS = Registry('heads') -modules = LazyImportModule.AST_INDEX[INDEX_KEY] +modules = LazyImportModule.get_ast_index()[INDEX_KEY] for module_index in list(modules.keys()): if module_index[1] == Tasks.backbone and module_index[0] == 'BACKBONES': modules[(MODELS.name.upper(), module_index[1], diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 5da87a001..2bf632f8e 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -4,14 +4,16 @@ from .
import (action_recognition, animal_recognition, bad_image_detecting, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, controllable_image_generation, - crowd_counting, face_detection, face_generation, - face_reconstruction, human3d_animation, human_reconstruction, - image_classification, image_color_enhance, image_colorization, - image_defrcn_fewshot, image_denoise, image_editing, - image_inpainting, image_instance_segmentation, image_matching, - image_mvs_depth_estimation, image_panoptic_segmentation, - image_portrait_enhancement, image_probing_model, - image_quality_assessment_degradation, + crowd_counting, dense_optical_flow_estimation, face_detection, + face_generation, face_reconstruction, human3d_animation, + human_reconstruction, image_classification, image_color_enhance, + image_colorization, image_defrcn_fewshot, image_denoise, + image_editing, image_inpainting, image_instance_segmentation, + image_local_feature_matching, image_matching, + image_matching_fast, image_mvs_depth_estimation, + image_mvs_depth_estimation_geomvsnet, + image_panoptic_segmentation, image_portrait_enhancement, + image_probing_model, image_quality_assessment_degradation, image_quality_assessment_man, image_quality_assessment_mos, image_reid_person, image_restoration, image_semantic_segmentation, image_super_resolution_pasd, diff --git a/modelscope/models/cv/action_detection/modules/resnet.py b/modelscope/models/cv/action_detection/modules/resnet.py index 7f5529a48..435aea528 100644 --- a/modelscope/models/cv/action_detection/modules/resnet.py +++ b/modelscope/models/cv/action_detection/modules/resnet.py @@ -233,7 +233,7 @@ def __init__(self, ops=ops[sum(layers[:3], 0):][:layers[3]]) if num_classes is not None: self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.sptial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3) + self.spatial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3) self.drop = nn.Dropout(0.5) if reduce_dim > 0: self.rd_conv = nn.Conv2d( @@ -308,7 +308,7 @@ def features(self, x): ftr = torch.cat( (x.max(dim=1, keepdim=True)[0], x.mean(dim=1, keepdim=True)), dim=1) - score = self.sptial_atten(ftr) # N,1,H,W + score = self.spatial_atten(ftr) # N,1,H,W x = x * torch.sigmoid(score) # N,C,H,W self.score = score diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py index 46e768927..fa271b471 100644 --- a/modelscope/models/cv/action_recognition/s3dg.py +++ b/modelscope/models/cv/action_recognition/s3dg.py @@ -1,5 +1,5 @@ # The implementation is adopted from https://github.com/TengdaHan/CoCLR, -# made pubicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR +# made publicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR # Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch import torch.nn as nn @@ -47,7 +47,7 @@ class InceptionBlock3D(nn.Module): Element constructing the S3D/S3DG. See models/base/backbone.py L99-186. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. """ def __init__(self, cfg, in_planes, out_planes): @@ -139,7 +139,7 @@ class STConv3d(nn.Module): Element constructing the S3D/S3DG. See models/base/backbone.py L99-186. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
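The renamed spatial_atten layer in the action-detection backbone above implements a CBAM-style spatial gate: channel-wise max and mean maps are concatenated, passed through a 7x7 convolution, and used as a sigmoid mask over the feature map. A standalone sketch with made-up shapes:

import torch
import torch.nn as nn

spatial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3)

x = torch.randn(2, 256, 14, 14)
ftr = torch.cat((x.max(dim=1, keepdim=True)[0], x.mean(dim=1, keepdim=True)), dim=1)
score = spatial_atten(ftr)          # N,1,H,W attention logits
gated = x * torch.sigmoid(score)    # N,C,H,W gated features
print(gated.shape)                  # torch.Size([2, 256, 14, 14])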
""" def __init__(self, @@ -213,7 +213,7 @@ def forward(self, x): class Inception3D(nn.Module): """ Backbone architecture for I3D/S3DG. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. """ def __init__(self, cfg): diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py index b1de7af8f..cf9738251 100644 --- a/modelscope/models/cv/action_recognition/tada_convnext.py +++ b/modelscope/models/cv/action_recognition/tada_convnext.py @@ -1,5 +1,5 @@ # The implementation is adopted from https://github.com/facebookresearch/ConvNeXt, -# made pubicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt +# made publicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt # Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import math diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py index d7c03c299..44e44722a 100644 --- a/modelscope/models/cv/animal_recognition/resnet.py +++ b/modelscope/models/cv/animal_recognition/resnet.py @@ -1,5 +1,5 @@ # The implementation is adopted from Split-Attention Network, A New ResNet Variant, -# made pubicly available under the Apache License 2.0 License +# made publicly available under the Apache License 2.0 License # at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py import math diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py index a10d0abe1..09d65b6de 100644 --- a/modelscope/models/cv/animal_recognition/splat.py +++ b/modelscope/models/cv/animal_recognition/splat.py @@ -1,5 +1,5 @@ # The implementation is adopted from Split-Attention Network, A New ResNet Variant, -# made pubicly available under the Apache License 2.0 License +# made publicly available under the Apache License 2.0 License # at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py """Split-Attention""" diff --git a/modelscope/models/cv/anydoor/__init__.py b/modelscope/models/cv/anydoor/__init__.py new file mode 100644 index 000000000..0eb176c42 --- /dev/null +++ b/modelscope/models/cv/anydoor/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .anydoor_model import ControlLDM + +else: + _import_structure = {'anydoor_model': ['ControlLDM']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/anydoor/anydoor_model.py b/modelscope/models/cv/anydoor/anydoor_model.py new file mode 100644 index 000000000..6e9316b74 --- /dev/null +++ b/modelscope/models/cv/anydoor/anydoor_model.py @@ -0,0 +1,519 @@ +import einops +import torch +import torch.nn as nn +from einops import rearrange, repeat +from torchvision.utils import make_grid + +from modelscope import Model +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .cldm.ddim_hacked import DDIMSampler +from .ldm.models.diffusion.ddpm import LatentDiffusion +from .ldm.modules.attention import SpatialTransformer +from .ldm.modules.diffusionmodules.openaimodel import (AttentionBlock, + Downsample, ResBlock, + TimestepEmbedSequential, + UNetModel) +from .ldm.modules.diffusionmodules.util import (conv_nd, linear, + timestep_embedding, + zero_module) +from .ldm.util import exists + + +class ControlledUnetModel(UNetModel): + + def forward(self, + x, + timesteps=None, + context=None, + control=None, + only_mid_control=False, + **kwargs): + hs = [] + with torch.no_grad(): + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + + if control is not None: + h += control.pop() + + for i, module in enumerate(self.output_blocks): + if only_mid_control or control is None: + h = torch.cat([h, hs.pop()], dim=1) + else: + h = torch.cat([h, hs.pop() + control.pop()], dim=1) + h = module(h, emb, context) + + h = h.type(x.dtype) + return self.out(h) + + +class ControlNet(nn.Module): + + def __init__( + self, + image_size, + in_channels, + model_channels, + hint_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None, 'Need to include the dimension of your cross-attention conditioning' + + if context_dim is not None: + assert use_spatial_transformer, 'Need to use the spatial transformer for your cross-attention conditioning' + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be 
set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.dims = dims + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. ' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.') + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = torch.float16 if use_fp16 else torch.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)]) + + self.input_hint_block = TimestepEmbedSequential( + conv_nd(dims, hint_channels, 16, 3, padding=1), nn.SiLU(), + conv_nd(dims, 16, 16, 3, padding=1), nn.SiLU(), + conv_nd(dims, 16, 32, 3, padding=1, stride=2), nn.SiLU(), + conv_nd(dims, 32, 32, 3, padding=1), nn.SiLU(), + conv_nd(dims, 32, 96, 3, padding=1, stride=2), nn.SiLU(), + conv_nd(dims, 96, 96, 3, padding=1), nn.SiLU(), + conv_nd(dims, 96, 256, 3, padding=1, stride=2), nn.SiLU(), + zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))) + + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < 
num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self.zero_convs.append(self.make_zero_conv(ch)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + self.zero_convs.append(self.make_zero_conv(ch)) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self.middle_block_out = self.make_zero_conv(ch) + self._feature_size += ch + + def make_zero_conv(self, channels): + return TimestepEmbedSequential( + zero_module(conv_nd(self.dims, channels, channels, 1, padding=0))) + + def forward(self, x, hint, timesteps, context, **kwargs): + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) # 1,1280 + + # 1,320,64,64 + guided_hint = self.input_hint_block(hint, emb, context) + outs = [] + + h = x.type(self.dtype) + for module, zero_conv in zip(self.input_blocks, self.zero_convs): + if guided_hint is not None: + # skip the first layer + h = guided_hint + guided_hint = None + else: + h_new = module(h, emb, context) + h = h_new + outs.append(zero_conv(h, emb, context)) + + h_new = self.middle_block(h, emb, context) + outs.append(self.middle_block_out(h_new, emb, context)) + return outs + + +@MODELS.register_module( + Tasks.image_to_image_generation, module_name=Models.anydoor) +class ControlLDM(LatentDiffusion, Model): + ''' + This work presents AnyDoor, a diffusion-based image generator + with the power to teleport target objects to new scenes + at user-specified locations in a harmonious way. 
+ + Instead of tuning parameters for each object, our model + is trained only once and effortlessly generalizes + to diverse object-scene combinations at the inference stage. + + arxiv: https://arxiv.org/abs/2307.09481 + ''' + + def __init__(self, control_stage_config, control_key, only_mid_control, + *args, **kwargs): + super().__init__(*args, **kwargs) + self.control_model = ControlNet(**control_stage_config) + self.control_key = control_key + self.only_mid_control = only_mid_control + self.control_scales = [1.0] * 13 + + @torch.no_grad() + def get_input(self, batch, k, bs=None, *args, **kwargs): + x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs) + control = batch[self.control_key] + if bs is not None: + control = control[:bs] + control = control.to(self.device) + control = einops.rearrange(control, 'b h w c -> b c h w') + control = control.to(memory_format=torch.contiguous_format).float() + self.time_steps = batch['time_steps'] + return x, dict(c_crossattn=[c], c_concat=[control]) + + def apply_model(self, x_noisy, t, cond, *args, **kwargs): + assert isinstance(cond, dict) + diffusion_model = self.model.diffusion_model + + cond_txt = torch.cat(cond['c_crossattn'], 1) + + if cond['c_concat'] is None: + eps = diffusion_model( + x=x_noisy, + timesteps=t, + context=cond_txt, + control=None, + only_mid_control=self.only_mid_control) + else: + control = self.control_model( + x=x_noisy, + hint=torch.cat(cond['c_concat'], 1), + timesteps=t, + context=cond_txt) + control = [ + c * scale for c, scale in zip(control, self.control_scales) + ] + eps = diffusion_model( + x=x_noisy, + timesteps=t, + context=cond_txt, + control=control, + only_mid_control=self.only_mid_control) + return eps + + @torch.no_grad() + def get_unconditional_conditioning(self, N): + uncond = self.get_learned_conditioning([torch.zeros( + (1, 3, 224, 224))] * N) + return uncond + + @torch.no_grad() + def log_images(self, + batch, + N=4, + n_row=2, + sample=False, + ddim_steps=50, + ddim_eta=0.0, + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=False, + unconditional_guidance_scale=9.0, + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + use_ddim = ddim_steps is not None + + log = dict() + z, c = self.get_input(batch, self.first_stage_key, bs=N) + c_cat, c = c['c_concat'][0][:N], c['c_crossattn'][0][:N] + N = min(z.shape[0], N) + n_row = min(z.shape[0], n_row) + log['reconstruction'] = self.decode_first_stage(z) + + # ==== visualize the shape mask or the high-frequency map ==== + guide_mask = (c_cat[:, -1, :, :].unsqueeze(1) + 1) * 0.5 + guide_mask = torch.cat([guide_mask, guide_mask, guide_mask], 1) + HF_map = c_cat[:, :3, :, :] # * 2.0 - 1.0 + + log['control'] = HF_map + + cond_image = batch[self.cond_stage_key].cpu().numpy().copy() + log['conditioning'] = torch.permute( + torch.tensor(cond_image), (0, 3, 1, 2)) * 2.0 - 1.0 + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') 
+ diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + samples, z_denoise_row = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning(N) + uc_cat = c_cat # torch.zeros_like(c_cat) + uc_full = {'c_concat': [uc_cat], 'c_crossattn': [uc_cross]} + samples_cfg, _ = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg # * 2.0 - 1.0 + return log + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + ddim_sampler = DDIMSampler(self) + b, c, h, w = cond['c_concat'][0].shape + shape = (self.channels, h // 8, w // 8) + samples, intermediates = ddim_sampler.sample( + ddim_steps, batch_size, shape, cond, verbose=False, **kwargs) + return samples, intermediates + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.control_model.parameters()) + if not self.sd_locked: + params += list( + self.model.diffusion_model.output_blocks.parameters()) + params += list(self.model.diffusion_model.out.parameters()) + params += list(self.cond_stage_model.projector.parameters()) + opt = torch.optim.AdamW(params, lr=lr) + return opt + + def low_vram_shift(self, is_diffusing): + if is_diffusing: + self.model = self.model.cuda() + self.control_model = self.control_model.cuda() + self.first_stage_model = self.first_stage_model.cpu() + self.cond_stage_model = self.cond_stage_model.cpu() + else: + self.model = self.model.cpu() + self.control_model = self.control_model.cpu() + self.first_stage_model = self.first_stage_model.cuda() + self.cond_stage_model = self.cond_stage_model.cuda() diff --git a/modelscope/models/cv/anydoor/cldm/__init__.py b/modelscope/models/cv/anydoor/cldm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/cldm/ddim_hacked.py b/modelscope/models/cv/anydoor/cldm/ddim_hacked.py new file mode 100644 index 000000000..e6adf5716 --- /dev/null +++ b/modelscope/models/cv/anydoor/cldm/ddim_hacked.py @@ -0,0 +1,428 @@ +"""SAMPLING ONLY.""" + +import numpy as np +import torch +from tqdm import tqdm + +from ..ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_ddim_sampling_parameters, + make_ddim_timesteps, + noise_like) + + +class DDIMSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + 
verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, # this has to come in the same format as the conditioning + dynamic_threshold=None, + ucg_schedule=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + 
ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None): + device = self.model.betas.device + b = shape[0] + # x_T 1,4,64,64 + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = reversed(range( + 0, timesteps)) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
- mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + model_output = self.model.apply_model(x, t, c) + else: + model_t = self.model.apply_model(x, t, c) + model_uncond = self.model.apply_model(x, t, + unconditional_conditioning) + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond) + + if self.model.parameterization == 'v': + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == 'eps', 'not implemented' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + if self.model.parameterization != 'v': + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode(self, + x0, + c, + t_enc, + use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None): + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + num_reference_steps = timesteps.shape[0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc='Encoding Image'): + t = torch.full((x0.shape[0], ), + timesteps[i], + device=self.model.device, + dtype=torch.long) + if unconditional_guidance_scale == 1.: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model( + torch.cat((x_next, x_next)), torch.cat((t, t)), + torch.cat((unconditional_conditioning, c))), 2) + noise_pred = e_t_uncond + unconditional_guidance_scale * ( + noise_pred - e_t_uncond) + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + tmp = (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt() + weighted_noise_pred = alphas_next[i].sqrt() * tmp * noise_pred + x_next = xt_weighted + weighted_noise_pred + if return_intermediates and i % (num_steps // return_intermediates + ) == 0 and i < num_steps - 1: + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} + if return_intermediates: + out.update({'intermediates': intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) + * noise) + + @torch.no_grad() + def decode(self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None): + + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = 
torch.full((x_latent.shape[0], ), + step, + device=x_latent.device, + dtype=torch.long) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + if callback: + callback(i) + return x_dec diff --git a/modelscope/models/cv/anydoor/datasets/__init__.py b/modelscope/models/cv/anydoor/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/datasets/data_utils.py b/modelscope/models/cv/anydoor/datasets/data_utils.py new file mode 100644 index 000000000..82d41b1cf --- /dev/null +++ b/modelscope/models/cv/anydoor/datasets/data_utils.py @@ -0,0 +1,364 @@ +import cv2 +import numpy as np +import torch + + +def mask_score(mask): + '''Scoring the mask according to connectivity.''' + mask = mask.astype(np.uint8) + if mask.sum() < 10: + return 0 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_NONE) + cnt_area = [cv2.contourArea(cnt) for cnt in contours] + conc_score = np.max(cnt_area) / sum(cnt_area) + return conc_score + + +def sobel(img, mask, thresh=50): + '''Calculating the high-frequency map.''' + H, W = img.shape[0], img.shape[1] + img = cv2.resize(img, (256, 256)) + mask = (cv2.resize(mask, (256, 256)) > 0.5).astype(np.uint8) + kernel = np.ones((5, 5), np.uint8) + mask = cv2.erode(mask, kernel, iterations=2) + + Ksize = 3 + sobelx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=Ksize) + sobely = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=Ksize) + sobel_X = cv2.convertScaleAbs(sobelx) + sobel_Y = cv2.convertScaleAbs(sobely) + scharr = cv2.addWeighted(sobel_X, 0.5, sobel_Y, 0.5, 0) + scharr = np.max(scharr, -1) * mask + + scharr[scharr < thresh] = 0.0 + scharr = np.stack([scharr, scharr, scharr], -1) + scharr = (scharr.astype(np.float32) / 255 * img.astype(np.float32)).astype( + np.uint8) + scharr = cv2.resize(scharr, (W, H)) + return scharr + + +def resize_and_pad(image, box): + '''Fitting an image to the box region while keeping the aspect ratio.''' + y1, y2, x1, x2 = box + H, W = y2 - y1, x2 - x1 + h, w = image.shape[0], image.shape[1] + r_box = W / H + r_image = w / h + if r_box >= r_image: + h_target = H + w_target = int(w * H / h) + image = cv2.resize(image, (w_target, h_target)) + + w1 = (W - w_target) // 2 + w2 = W - w_target - w1 + pad_param = ((0, 0), (w1, w2), (0, 0)) + image = np.pad(image, pad_param, 'constant', constant_values=255) + else: + w_target = W + h_target = int(h * W / w) + image = cv2.resize(image, (w_target, h_target)) + + h1 = (H - h_target) // 2 + h2 = H - h_target - h1 + pad_param = ((h1, h2), (0, 0), (0, 0)) + image = np.pad(image, pad_param, 'constant', constant_values=255) + return image + + +def expand_image_mask(image, mask, ratio=1.4): + h, w = image.shape[0], image.shape[1] + H, W = int(h * ratio), int(w * ratio) + h1 = int((H - h) // 2) + h2 = H - h - h1 + w1 = int((W - w) // 2) + w2 = W - w - w1 + + pad_param_image = ((h1, h2), (w1, w2), (0, 0)) + pad_param_mask = ((h1, h2), (w1, w2)) + image = np.pad(image, pad_param_image, 'constant', constant_values=255) + mask = np.pad(mask, pad_param_mask, 'constant', constant_values=0) + return image, mask + + +def resize_box(yyxx, H, W, h, w): + y1, y2, x1, x2 = yyxx + y1, y2 = int(y1 / H * h), int(y2 / H * h) + x1, x2 = int(x1 / W * w), int(x2 / W * w) + y1, y2 = min(y1, h), min(y2, h) + x1, x2 = min(x1, w), min(x2, w) + return (y1, y2, x1, x2) + + +def 
get_bbox_from_mask(mask): + h, w = mask.shape[0], mask.shape[1] + + if mask.sum() < 10: + return 0, h, 0, w + rows = np.any(mask, axis=1) + cols = np.any(mask, axis=0) + y1, y2 = np.where(rows)[0][[0, -1]] + x1, x2 = np.where(cols)[0][[0, -1]] + return (y1, y2, x1, x2) + + +def expand_bbox(mask, yyxx, ratio=[1.2, 2.0], min_crop=0): + y1, y2, x1, x2 = yyxx + ratio = np.random.randint(ratio[0] * 10, ratio[1] * 10) / 10 + H, W = mask.shape[0], mask.shape[1] + xc, yc = 0.5 * (x1 + x2), 0.5 * (y1 + y2) + h = ratio * (y2 - y1 + 1) + w = ratio * (x2 - x1 + 1) + h = max(h, min_crop) + w = max(w, min_crop) + + x1 = int(xc - w * 0.5) + x2 = int(xc + w * 0.5) + y1 = int(yc - h * 0.5) + y2 = int(yc + h * 0.5) + + x1 = max(0, x1) + x2 = min(W, x2) + y1 = max(0, y1) + y2 = min(H, y2) + return (y1, y2, x1, x2) + + +def box2squre(image, box): + H, W = image.shape[0], image.shape[1] + y1, y2, x1, x2 = box + cx = (x1 + x2) // 2 + cy = (y1 + y2) // 2 + h, w = y2 - y1, x2 - x1 + + if h >= w: + x1 = cx - h // 2 + x2 = cx + h // 2 + else: + y1 = cy - w // 2 + y2 = cy + w // 2 + x1 = max(0, x1) + x2 = min(W, x2) + y1 = max(0, y1) + y2 = min(H, y2) + return (y1, y2, x1, x2) + + +def pad_to_square(image, pad_value=255, random=False): + H, W = image.shape[0], image.shape[1] + if H == W: + return image + + padd = abs(H - W) + if random: + padd_1 = int(np.random.randint(0, padd)) + else: + padd_1 = int(padd / 2) + padd_2 = padd - padd_1 + + if H > W: + pad_param = ((0, 0), (padd_1, padd_2), (0, 0)) + else: + pad_param = ((padd_1, padd_2), (0, 0), (0, 0)) + + image = np.pad(image, pad_param, 'constant', constant_values=pad_value) + return image + + +def box_in_box(small_box, big_box): + y1, y2, x1, x2 = small_box + y1_b, _, x1_b, _ = big_box + y1, y2, x1, x2 = y1 - y1_b, y2 - y1_b, x1 - x1_b, x2 - x1_b + return (y1, y2, x1, x2) + + +def shuffle_image(image, N): + height, width = image.shape[:2] + + block_height = height // N + block_width = width // N + blocks = [] + + for i in range(N): + for j in range(N): + block = image[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) * block_width] + blocks.append(block) + + np.random.shuffle(blocks) + shuffled_image = np.zeros((height, width, 3), dtype=np.uint8) + + for i in range(N): + for j in range(N): + shuffled_image[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) + * block_width] = blocks[i * N + j] + return shuffled_image + + +def get_mosaic_mask(image, fg_mask, N=16, ratio=0.5): + ids = [i for i in range(N * N)] + masked_number = int(N * N * ratio) + masked_id = np.random.choice(ids, masked_number, replace=False) + + height, width = image.shape[:2] + mask = np.ones((height, width)) + + block_height = height // N + block_width = width // N + + b_id = 0 + for i in range(N): + for j in range(N): + if b_id in masked_id: + mask[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) + * block_width] = mask[i * block_height:(i + 1) + * block_height, j * block_width: + (j + 1) * block_width] * 0 + b_id += 1 + mask = mask * fg_mask + mask3 = np.stack([mask, mask, mask], -1).copy().astype(np.uint8) + noise = q_x(image) + noise_mask = image * mask3 + noise * (1 - mask3) + return noise_mask + + +def extract_canny_noise(image, mask, dilate=True): + h, w = image.shape[0], image.shape[1] + mask = cv2.resize(mask.astype(np.uint8), (w, h)) > 0.5 + kernel = np.ones((8, 8), dtype=np.uint8) + mask = cv2.erode(mask.astype(np.uint8), kernel, 10) + + canny = cv2.Canny(image, 50, 100) * mask + kernel = np.ones((8, 8), dtype=np.uint8) + mask = 
(cv2.dilate(canny, kernel, 5) > 128).astype(np.uint8) + mask = np.stack([mask, mask, mask], -1) + + pure_noise = q_x(image, t=1) * 0 + 255 + canny_noise = mask * image + (1 - mask) * pure_noise + return canny_noise + + +def get_random_structure(size): + choice = np.random.randint(1, 5) + + if choice == 1: + return cv2.getStructuringElement(cv2.MORPH_RECT, (size, size)) + elif choice == 2: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size)) + elif choice == 3: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size // 2)) + elif choice == 4: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size // 2, size)) + + +def random_dilate(seg, min=3, max=10): + size = np.random.randint(min, max) + kernel = get_random_structure(size) + seg = cv2.dilate(seg, kernel, iterations=1) + return seg + + +def random_erode(seg, min=3, max=10): + size = np.random.randint(min, max) + kernel = get_random_structure(size) + seg = cv2.erode(seg, kernel, iterations=1) + return seg + + +def compute_iou(seg, gt): + intersection = seg * gt + union = seg + gt + return (np.count_nonzero(intersection) + 1e-6) / ( + np.count_nonzero(union) + 1e-6) + + +def select_max_region(mask): + nums, labels, stats, centroids = cv2.connectedComponentsWithStats( + mask, connectivity=8) + background = 0 + for row in range(stats.shape[0]): + if stats[row, :][0] == 0 and stats[row, :][1] == 0: + background = row + stats_no_bg = np.delete(stats, background, axis=0) + max_idx = stats_no_bg[:, 4].argmax() + max_region = np.where(labels == max_idx + 1, 1, 0) + + return max_region.astype(np.uint8) + + +def perturb_mask(gt, min_iou=0.3, max_iou=0.99): + iou_target = np.random.uniform(min_iou, max_iou) + h, w = gt.shape + gt = gt.astype(np.uint8) + seg = gt.copy() + + # Rare case + if h <= 2 or w <= 2: + print('GT too small, returning original') + return seg + + # Do a bunch of random operations + for _ in range(250): + for _ in range(4): + lx, ly = np.random.randint(w), np.random.randint(h) + lw, lh = np.random.randint(lx + 1, w + 1), np.random.randint( + ly + 1, h + 1) + + # Randomly set one pixel to 1/0. 
With the following dilate/erode, we can create holes/external regions + if np.random.rand() < 0.1: + cx = int((lx + lw) / 2) + cy = int((ly + lh) / 2) + seg[cy, cx] = np.random.randint(2) * 255 + + # Dilate/erode + if np.random.rand() < 0.5: + seg[ly:lh, lx:lw] = random_dilate(seg[ly:lh, lx:lw]) + else: + seg[ly:lh, lx:lw] = random_erode(seg[ly:lh, lx:lw]) + + seg = np.logical_or(seg, gt).astype(np.uint8) + # seg = select_max_region(seg) + + if compute_iou(seg, gt) < iou_target: + break + seg = select_max_region(seg.astype(np.uint8)) + return seg.astype(np.uint8) + + +def q_x(x_0, t=65): + '''Adding noise for and given image.''' + x_0 = torch.from_numpy(x_0).float() / 127.5 - 1 + num_steps = 100 + + betas = torch.linspace(-6, 6, num_steps) + betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 + + alphas = 1 - betas + alphas_prod = torch.cumprod(alphas, 0) + + alphas_bar_sqrt = torch.sqrt(alphas_prod) + one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) + + noise = torch.randn_like(x_0) + alphas_t = alphas_bar_sqrt[t] + alphas_1_m_t = one_minus_alphas_bar_sqrt[t] + return (alphas_t * x_0 + alphas_1_m_t * noise).numpy() * 127.5 + 127.5 + + +def extract_target_boundary(img, target_mask): + Ksize = 3 + sobelx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=Ksize) + sobely = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=Ksize) + + # sobel-x + sobel_X = cv2.convertScaleAbs(sobelx) + # sobel-y + sobel_Y = cv2.convertScaleAbs(sobely) + # sobel-xy + scharr = cv2.addWeighted(sobel_X, 0.5, sobel_Y, 0.5, 0) + scharr = np.max(scharr, -1).astype(np.float32) / 255 + scharr = scharr * target_mask.astype(np.float32) + return scharr diff --git a/modelscope/models/cv/anydoor/dinov2/__init__.py b/modelscope/models/cv/anydoor/dinov2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py new file mode 100644 index 000000000..daadf5eb3 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .attention import MemEffAttention +from .block import NestedTensorBlock +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py new file mode 100644 index 000000000..2efee7368 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
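The data_utils.py helpers added above are plain NumPy/OpenCV functions. A minimal sketch of chaining a few of them to build the masked high-frequency map for a reference object; the file names, the expand ratio, and the exact preprocessing order are illustrative assumptions, not taken from this diff, and the helpers are assumed to be imported from the module above.

import cv2
import numpy as np

# Hypothetical inputs: an RGB reference image and its binary object mask.
ref_image = cv2.imread('ref_image.png')                      # H x W x 3, uint8
ref_mask = (cv2.imread('ref_mask.png', 0) > 128).astype(np.uint8)

# get_bbox_from_mask, expand_image_mask, sobel and pad_to_square are defined above.
y1, y2, x1, x2 = get_bbox_from_mask(ref_mask)                # tight box around the object
ref_crop, mask_crop = ref_image[y1:y2, x1:x2], ref_mask[y1:y2, x1:x2]
ref_crop, mask_crop = expand_image_mask(ref_crop, mask_crop, ratio=1.3)

hf_map = sobel(ref_crop, mask_crop)                          # masked high-frequency detail map
hf_map = pad_to_square(hf_map, pad_value=0)                  # square canvas for later resizing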
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor, nn + +logger = logging.getLogger('dinov2') + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning('xFormers not available') + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, 'xFormers is required for nested tensors usage' + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + if attn_bias is not None: + self_att_op = fmha.MemoryEfficientAttentionFlashAttentionOp + else: + self_att_op = None + x = memory_efficient_attention( + q, k, v, attn_bias=attn_bias, op=self_att_op) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py new file mode 100644 index 000000000..f9f1f9caf --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py @@ -0,0 +1,286 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
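A quick shape check for the Attention module defined in attention.py above; the dimensions are illustrative (any dim divisible by num_heads works), and xFormers is only needed for the MemEffAttention nested-tensor path, not for this plain forward.

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 257, 768)      # (batch, 1 cls token + 16x16 patches, embed_dim)
out = attn(tokens)
assert out.shape == tokens.shape       # self-attention preserves the token shape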
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Any, Callable, Dict, List, Tuple + +import torch +from torch import Tensor, nn + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + +logger = logging.getLogger('dinov2') + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning('xFormers not available') + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = 
torch.index_add( + x_flat, + 0, + brange, + residual.to(dtype=x.dtype), + alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, + brange, + residual, + residual_scale_factor, + scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add( + x_flat, + 0, + brange, + residual.to(dtype=x.dtype), + alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, + brange, + residual.to(dtype=x.dtype), + scaling=scaling_vector, + alpha=residual_scale_factor) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges + ] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], + branges).view(1, -1, + x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [ + get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) + for x in x_list + ] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func( + x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip( + x_list, branges, residual_list, residual_scale_factors): + outputs.append( + add_residual(x, brange, residual, residual_scale_factor, + scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, 
+ scaling_vector=self.ls1.gamma if isinstance( + self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance( + self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, 'Please install xFormers for nested tensors usage' + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py new file mode 100644 index 000000000..72a21386f --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp( + nlayers, + in_dim, + bottleneck_dim, + hidden_dim=hidden_dim, + use_bn=use_bn, + bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm( + nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, + in_dim, + bottleneck_dim, + hidden_dim=None, + use_bn=False, + bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py new file mode 100644 index 000000000..d28930e1e --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py new file mode 100644 index 000000000..c84e741a1 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +import torch +from torch import Tensor, nn + + +class LayerScale(nn.Module): + + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py new file mode 100644 index 000000000..68a286b73 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
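A small illustration of the two residual-branch utilities above, with arbitrary values: DropPath (stochastic depth) only drops samples in training mode, and LayerScale applies a learnable per-channel gain initialized to init_values.

import torch

x = torch.randn(8, 16, 768)

dp = DropPath(drop_prob=0.2)
dp.eval()
assert torch.equal(dp(x), x)            # stochastic depth is a no-op at inference time

ls = LayerScale(dim=768, init_values=1e-5)
assert torch.allclose(ls(x), x * 1e-5)  # per-channel gain, initialized to 1e-5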
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py new file mode 100644 index 000000000..ec5aa7521 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +import torch.nn as nn +from torch import Tensor + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
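+        flatten_embedding: Whether to flatten the patch grid into (B, N, D) tokens (default) or keep the (B, H', W', D) layout.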
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f'Input image height {H} is not a multiple of patch height {patch_H}' + assert W % patch_W == 0, f'Input image width {W} is not a multiple of patch width: {patch_W}' + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py new file mode 100644 index 000000000..b6c593f7a --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
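A shape walk-through for the PatchEmbed module above, with illustrative values: a 224x224 input split into 16x16 patches yields a 14*14 = 196 token sequence.

import torch

pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
imgs = torch.randn(2, 3, 224, 224)
tokens = pe(imgs)
assert tokens.shape == (2, 196, 768)    # (B, N, D) with flatten_embedding=True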
+ +from typing import Callable, Optional + +import torch.nn.functional as F +from torch import Tensor, nn + + +class SwiGLUFFN(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py new file mode 100644 index 000000000..4d8b4118a --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + +logger = logging.getLogger('dinov2') + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix('_memeff') + if 'vit' in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model( + cfg.student, + only_teacher=only_teacher, + img_size=cfg.crops.global_crops_size) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py b/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py new file mode 100644 index 000000000..2c9c6ec96 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py @@ -0,0 +1,390 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
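+
+# Rough usage sketch (illustrative only; the values mirror the hubconf defaults):
+#   model = vit_large(patch_size=14, img_size=518, init_values=1.0, block_chunks=0)
+#   out = model.forward_features(torch.randn(1, 3, 518, 518))
+#   patch_tokens = out['x_norm_patchtokens']  # shape (1, (518 // 14) ** 2, 1024)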
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import math +from functools import partial +from typing import Callable, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from ..layers import MemEffAttention, Mlp +from ..layers import NestedTensorBlock as Block +from ..layers import PatchEmbed, SwiGLUFFNFused + +logger = logging.getLogger('dinov2') + + +def named_apply(fn: Callable, + module: nn.Module, + name='', + depth_first=True, + include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = '.'.join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer='mlp', + block_chunks=1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + if ffn_layer == 'mlp': + logger.info('using MLP layer as FFN') + ffn_layer = Mlp + elif ffn_layer == 'swiglufused' or ffn_layer == 
'swiglu': + logger.info('using SwiGLU layer as FFN') + ffn_layer = SwiGLUFFNFused + elif ffn_layer == 'identity': + logger.info('using Identity layer as FFN') + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + + blocks_list[i:i + chunksize]) + self.blocks = nn.ModuleList( + [BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), + dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode='bicubic', + ) + + assert int(w0) == patch_pos_embed.shape[-2] and int( + h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), + dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where( + masks.unsqueeze(-1), + self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [ + self.prepare_tokens_with_masks(x, masks) + for x, masks in zip(x_list, masks_list) + ] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append({ + 'x_norm_clstoken': x_norm[:, 0], + 'x_norm_patchtokens': x_norm[:, 1:], + 'x_prenorm': x, + 'masks': masks, + }) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + 'x_norm_clstoken': x_norm[:, 0], + 'x_norm_patchtokens': x_norm[:, 1:], + 'x_prenorm': x, + 'masks': masks, + } + + def 
_get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, + total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len( + blocks_to_take + ), f'only {len(output)} / {len(blocks_to_take)} blocks found' + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = range(total_block_len - n, + total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len( + blocks_to_take + ), f'only {len(output)} / {len(blocks_to_take)} blocks found' + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, + -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret['x_norm_clstoken']) + + +def init_weights_vit_timm(module: nn.Module, name: str = ''): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_base(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_large(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model diff --git 
a/modelscope/models/cv/anydoor/dinov2/hubconf.py b/modelscope/models/cv/anydoor/dinov2/hubconf.py new file mode 100644 index 000000000..42660f64e --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/hubconf.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +dependencies = ['torch'] + +_DINOV2_BASE_URL = 'https://dl.fbaipublicfiles.com/dinov2' + + +def _make_dinov2_model_name(arch_name: str, patch_size: int) -> str: + compact_arch_name = arch_name.replace('_', '')[:4] + return f'dinov2_{compact_arch_name}{patch_size}' + + +def _make_dinov2_model( + *, + arch_name: str = 'vit_large', + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = 'mlp', + block_chunks: int = 0, + pretrained: bool = True, + **kwargs, +): + from .dinov2.models import vision_transformer as vits + + _ = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + # if pretrained: + # state_dict = torch.load('') + # model.load_state_dict(state_dict, strict=False) + return model + + +def dinov2_vits14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_small', pretrained=pretrained, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-B/14 model pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_base', pretrained=pretrained, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_large', pretrained=pretrained, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name='vit_giant2', + ffn_layer='swiglufused', + pretrained=pretrained, + **kwargs) + + +def _make_dinov2_linear_head( + *, + model_name: str = 'dinov2_vitl14', + embed_dim: int = 1024, + layers: int = 4, + pretrained: bool = True, + **kwargs, +): + assert layers in (1, 4), f'Unsupported number of layers: {layers}' + linear_head = nn.Linear((1 + layers) * embed_dim, 1_000) + + if pretrained: + layers_str = str(layers) if layers == 4 else '' + url = _DINOV2_BASE_URL + f'/{model_name}/{model_name}_linear{layers_str}_head.pth' + state_dict = torch.hub.load_state_dict_from_url( + url, map_location='cpu') + linear_head.load_state_dict(state_dict, strict=False) + + return linear_head + + +class _LinearClassifierWrapper(nn.Module): + + def __init__(self, + *, + backbone: nn.Module, + linear_head: nn.Module, + layers: int = 4): + super().__init__() + self.backbone = backbone + self.linear_head = linear_head + self.layers = layers + + def forward(self, x): + if self.layers == 1: + x = self.backbone.forward_features(x) + cls_token = x['x_norm_clstoken'].squeeze(0) + patch_tokens = x['x_norm_patchtokens'].squeeze(0) + linear_input = torch.cat([cls_token, patch_tokens.mean(0)]) + elif self.layers == 4: + x = self.backbone.get_intermediate_layers( + x, n=4, return_class_token=True) + linear_input = torch.cat([ + x[0][1].squeeze(0), x[1][1].squeeze(0), x[2][1].squeeze(0), + x[3][1].squeeze(0), x[3][0].squeeze(0).mean(0) + ]) + else: + assert False, f'Unsupported number of layers: {self.layers}' + return self.linear_head(linear_input) + + +def _make_dinov2_linear_classifier( + *, + arch_name: str = 'vit_large', + layers: int = 4, + pretrained: bool = True, + **kwargs, +): + backbone = _make_dinov2_model( + arch_name=arch_name, pretrained=pretrained, **kwargs) + + embed_dim = backbone.embed_dim + patch_size = backbone.patch_size + model_name = _make_dinov2_model_name(arch_name, patch_size) + linear_head = _make_dinov2_linear_head( + model_name=model_name, + embed_dim=embed_dim, + layers=layers, + pretrained=pretrained) + + return _LinearClassifierWrapper( + backbone=backbone, linear_head=linear_head, layers=layers) + + +def dinov2_vits14_lc(*, layers: int = 4, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_small', layers=layers, pretrained=pretrained, **kwargs) + + +def dinov2_vitb14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_base', pretrained=pretrained, **kwargs) + + +def dinov2_vitl14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_large', pretrained=pretrained, **kwargs) + + +def dinov2_vitg14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. 
+ """ + return _make_dinov2_linear_classifier( + arch_name='vit_giant2', + ffn_layer='swiglufused', + pretrained=pretrained, + **kwargs) diff --git a/modelscope/models/cv/anydoor/ldm/__init__.py b/modelscope/models/cv/anydoor/ldm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/__init__.py b/modelscope/models/cv/anydoor/ldm/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/autoencoder.py b/modelscope/models/cv/anydoor/ldm/models/autoencoder.py new file mode 100644 index 000000000..cfa91c1eb --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/autoencoder.py @@ -0,0 +1,274 @@ +from contextlib import contextmanager + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F + +from ...ldm.modules.diffusionmodules.model import Decoder, Encoder +from ...ldm.modules.distributions.distributions import \ + DiagonalGaussianDistribution +from ...ldm.modules.ema import LitEma +from ...ldm.util import instantiate_from_config + + +class AutoencoderKL(pl.LightningModule): + + def __init__(self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + ema_decay=None, + learn_logvar=False): + super().__init__() + self.learn_logvar = learn_logvar + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig['double_z'] + self.quant_conv = torch.nn.Conv2d(2 * ddconfig['z_channels'], + 2 * embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + + self.use_ema = ema_decay is not None + if self.use_ema: + self.ema_decay = ema_decay + assert 0. < ema_decay < 1. 
+ self.model_ema = LitEma(self, decay=ema_decay) + print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + print(f'Restored from {path}') + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print(f'{context}: Switched to EMA weights') + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print(f'{context}: Restored training weights') + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self) + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log( + 'aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + + self.log( + 'discloss', + discloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + _ = self._validation_step(batch, batch_idx, postfix='_ema') + return log_dict + + def _validation_step(self, batch, batch_idx, postfix=''): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + postfix) + + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + postfix) + + self.log(f'val{postfix}/rec_loss', + log_dict_ae[f'val{postfix}/rec_loss']) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + 
return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + ae_params_list = list(self.encoder.parameters()) + list( + self.decoder.parameters()) + list( + self.quant_conv.parameters()) + list( + self.post_quant_conv.parameters()) + if self.learn_logvar: + print(f'{self.__class__.__name__}: Learning logvar') + ae_params_list.append(self.loss.logvar) + opt_ae = torch.optim.Adam(ae_params_list, lr=lr, betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['samples'] = self.decode(torch.randn_like(posterior.sample())) + log['reconstructions'] = xrec + if log_ema or self.use_ema: + with self.ema_scope(): + xrec_ema, posterior_ema = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec_ema.shape[1] > 3 + xrec_ema = self.to_rgb(xrec_ema) + log['samples_ema'] = self.decode( + torch.randn_like(posterior_ema.sample())) + log['reconstructions_ema'] = xrec_ema + log['inputs'] = x + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. + return x + + +class IdentityFirstStage(torch.nn.Module): + + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/__init__.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py new file mode 100644 index 000000000..53a98fc73 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py @@ -0,0 +1,446 @@ +"""SAMPLING ONLY.""" + +import numpy as np +import torch +from tqdm import tqdm + +from ....ldm.modules.diffusionmodules.util import ( + extract_into_tensor, make_ddim_sampling_parameters, make_ddim_timesteps, + noise_like) + + +class DDIMSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + 
verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample(self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + 
unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = reversed(range( + 0, timesteps)) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. - mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + model_output = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + if isinstance(c, dict): + assert isinstance(unconditional_conditioning, dict) + c_in = dict() + for k in c: + if isinstance(c[k], list): + c_in[k] = [ + torch.cat( + [unconditional_conditioning[k][i], c[k][i]]) + for i in range(len(c[k])) + ] + else: + c_in[k] = torch.cat( + [unconditional_conditioning[k], c[k]]) + elif isinstance(c, list): + c_in = list() + assert isinstance(unconditional_conditioning, list) + 
for i in range(len(c)): + c_in.append( + torch.cat([unconditional_conditioning[i], c[i]])) + else: + c_in = torch.cat([unconditional_conditioning, c]) + model_uncond, model_t = self.model.apply_model(x_in, t_in, + c_in).chunk(2) + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond) + + if self.model.parameterization == 'v': + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == 'eps', 'not implemented' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + if self.model.parameterization != 'v': + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode(self, + x0, + c, + t_enc, + use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None): + num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[ + 0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc='Encoding Image'): + t = torch.full((x0.shape[0], ), + i, + device=self.model.device, + dtype=torch.long) + if unconditional_guidance_scale == 1.: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model( + torch.cat((x_next, x_next)), torch.cat((t, t)), + torch.cat((unconditional_conditioning, c))), 2) + tmp = noise_pred - e_t_uncond + noise_pred = e_t_uncond + unconditional_guidance_scale * tmp + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + tmp = (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt() + weighted_noise_pred = alphas_next[i].sqrt() * tmp * noise_pred + x_next = xt_weighted + weighted_noise_pred + if return_intermediates and i % (num_steps // return_intermediates + ) == 0 and i < num_steps - 1: + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} + if return_intermediates: + out.update({'intermediates': intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) + * noise) + + @torch.no_grad() + def decode(self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None): + + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((x_latent.shape[0], ), + step, + 
device=x_latent.device, + dtype=torch.long) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + if callback: + callback(i) + return x_dec diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py new file mode 100644 index 000000000..78faa630e --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py @@ -0,0 +1,2295 @@ +""" +wild mixture of +https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py +https://github.com/CompVis/taming-transformers +-- merci +""" + +import itertools +import os +from contextlib import contextmanager, nullcontext +from functools import partial + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from omegaconf import ListConfig +from pytorch_lightning.utilities.distributed import rank_zero_only +from torch.optim.lr_scheduler import LambdaLR +from torchvision.utils import make_grid +from tqdm import tqdm + +from ....ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage +from ....ldm.models.diffusion.ddim import DDIMSampler +from ....ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_beta_schedule, + noise_like) +from ....ldm.modules.distributions.distributions import ( + DiagonalGaussianDistribution, normal_kl) +from ....ldm.modules.ema import LitEma +from ....ldm.util import (count_params, default, exists, + instantiate_from_config, isimage, ismap, + log_txt_as_img, mean_flat) + +__conditioning_keys__ = { + 'concat': 'c_concat', + 'crossattn': 'c_crossattn', + 'adm': 'y' +} + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +def uniform_on_device(r1, r2, shape, device): + return (r1 - r2) * torch.rand(*shape, device=device) + r2 + + +class DDPM(pl.LightningModule): + # classic DDPM with Gaussian diffusion, in image space + def __init__( + self, + unet_config, + timesteps=1000, + beta_schedule='linear', + loss_type='l2', + ckpt_path=None, + ignore_keys=[], + load_only_unet=False, + monitor='val/loss', + use_ema=True, + first_stage_key='image', + image_size=256, + channels=3, + log_every_t=100, + clip_denoised=True, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + given_betas=None, + original_elbo_weight=0., + v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + l_simple_weight=1., + conditioning_key=None, + parameterization='eps', # all assuming fixed variance schedules + scheduler_config=None, + use_positional_encodings=False, + learn_logvar=False, + logvar_init=0., + make_it_fit=False, + ucg_training=None, + reset_ema=False, + reset_num_ema_updates=False, + **kwargs): + super().__init__() + assert parameterization in [ + 'eps', 'x0', 'v' + ], 'currently only supporting "eps" and "x0" and "v"' + self.parameterization = parameterization + print( + f'{self.__class__.__name__}: Running in {self.parameterization}-prediction mode' + ) + self.cond_stage_model = 
None
+        self.clip_denoised = clip_denoised
+        self.log_every_t = log_every_t
+        self.first_stage_key = first_stage_key
+        self.image_size = image_size  # try conv?
+        self.channels = channels
+        self.use_positional_encodings = use_positional_encodings
+        self.model = DiffusionWrapper(unet_config, conditioning_key)
+        count_params(self.model, verbose=True)
+        self.use_ema = use_ema
+        if self.use_ema:
+            self.model_ema = LitEma(self.model)
+            print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.')
+
+        self.use_scheduler = scheduler_config is not None
+        if self.use_scheduler:
+            self.scheduler_config = scheduler_config
+
+        self.v_posterior = v_posterior
+        self.original_elbo_weight = original_elbo_weight
+        self.l_simple_weight = l_simple_weight
+
+        if monitor is not None:
+            self.monitor = monitor
+        self.make_it_fit = make_it_fit
+        if reset_ema:
+            assert exists(ckpt_path)
+        if ckpt_path is not None:
+            self.init_from_ckpt(
+                ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+            if reset_ema:
+                assert self.use_ema
+                print(
+                    'Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.'
+                )
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(
+                ' +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ '
+            )
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
+
+        self.register_schedule(
+            given_betas=given_betas,
+            beta_schedule=beta_schedule,
+            timesteps=timesteps,
+            linear_start=linear_start,
+            linear_end=linear_end,
+            cosine_s=cosine_s)
+
+        self.loss_type = loss_type
+
+        self.learn_logvar = learn_logvar
+        logvar = torch.full(
+            fill_value=logvar_init, size=(self.num_timesteps, ))
+        if self.learn_logvar:
+            self.logvar = nn.Parameter(logvar, requires_grad=True)
+        else:
+            self.register_buffer('logvar', logvar)
+
+        self.ucg_training = ucg_training or dict()
+        if self.ucg_training:
+            self.ucg_prng = np.random.RandomState()
+
+    def register_schedule(self,
+                          given_betas=None,
+                          beta_schedule='linear',
+                          timesteps=1000,
+                          linear_start=1e-4,
+                          linear_end=2e-2,
+                          cosine_s=8e-3):
+        if exists(given_betas):
+            betas = given_betas
+        else:
+            betas = make_beta_schedule(
+                beta_schedule,
+                timesteps,
+                linear_start=linear_start,
+                linear_end=linear_end,
+                cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert alphas_cumprod.shape[
+            0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+
+        self.register_buffer('betas', to_torch(betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev',
+                             to_torch(alphas_cumprod_prev))
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod',
+                             to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod',
+                             to_torch(np.sqrt(1. - alphas_cumprod)))
+        self.register_buffer('log_one_minus_alphas_cumprod',
+                             to_torch(np.log(1. - alphas_cumprod)))
+        self.register_buffer('sqrt_recip_alphas_cumprod',
+                             to_torch(np.sqrt(1. / alphas_cumprod)))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod',
+                             to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        posterior_variance = (1 - self.v_posterior) * betas * (
+            1. - alphas_cumprod_prev) / (
+                1. - alphas_cumprod) + self.v_posterior * betas
+        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+        self.register_buffer('posterior_variance',
+                             to_torch(posterior_variance))
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.register_buffer(
+            'posterior_log_variance_clipped',
+            to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
+        tmp = betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)
+        self.register_buffer('posterior_mean_coef1', to_torch(tmp))
+        tmp = (1. - alphas_cumprod_prev) * np.sqrt(alphas)
+        self.register_buffer('posterior_mean_coef2',
+                             to_torch(tmp / (1. - alphas_cumprod)))
+
+        if self.parameterization == 'eps':
+            tmp = 2 * self.posterior_variance * to_torch(alphas)
+            lvlb_weights = self.betas**2 / (tmp * (1 - self.alphas_cumprod))
+        elif self.parameterization == 'x0':
+            lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (
+                2. * 1 - torch.Tensor(alphas_cumprod))
+        elif self.parameterization == 'v':
+            tmp = 2 * self.posterior_variance * to_torch(alphas)
+            tmp = self.betas**2 / (tmp * (1 - self.alphas_cumprod))
+            lvlb_weights = torch.ones_like(tmp)
+        else:
+            raise NotImplementedError('mu not supported')
+        lvlb_weights[0] = lvlb_weights[1]
+        self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
+        assert not torch.isnan(self.lvlb_weights).all()
+
+    @contextmanager
+    def ema_scope(self, context=None):
+        if self.use_ema:
+            self.model_ema.store(self.model.parameters())
+            self.model_ema.copy_to(self.model)
+            if context is not None:
+                print(f'{context}: Switched to EMA weights')
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.model.parameters())
+                if context is not None:
+                    print(f'{context}: Restored training weights')
+
+    @torch.no_grad()
+    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+        sd = torch.load(path, map_location='cpu')
+        if 'state_dict' in list(sd.keys()):
+            sd = sd['state_dict']
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print('Deleting key {} from state_dict.'.format(k))
+                    del sd[k]
+        if self.make_it_fit:
+            n_params = len([
+                name for name, _ in itertools.chain(self.named_parameters(),
+                                                    self.named_buffers())
+            ])
+            for name, param in tqdm(
+                    itertools.chain(self.named_parameters(),
+                                    self.named_buffers()),
+                    desc='Fitting old weights to new weights',
+                    total=n_params):
+                if name not in sd:
+                    continue
+                old_shape = sd[name].shape
+                new_shape = param.shape
+                assert len(old_shape) == len(new_shape)
+                if len(new_shape) > 2:
+                    # we only modify first two axes
+                    assert new_shape[2:] == old_shape[2:]
+                # assumes first axis corresponds to output dim
+                if not new_shape == old_shape:
+                    new_param = param.clone()
+                    old_param = sd[name]
+                    if len(new_shape) == 1:
+                        for i in range(new_param.shape[0]):
+                            new_param[i] = old_param[i % old_shape[0]]
+                    elif len(new_shape) >= 2:
+                        for i in range(new_param.shape[0]):
+                            for j in range(new_param.shape[1]):
+                                new_param[i, j] = old_param[i % old_shape[0],
+                                                            j % old_shape[1]]
+
+                        n_used_old = torch.ones(old_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_old[j % old_shape[1]] += 1
+                        n_used_new = torch.zeros(new_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_new[j] = n_used_old[j % old_shape[1]]
+
+                        n_used_new = n_used_new[None, :]
+                        while len(n_used_new.shape) < len(new_shape):
+                            n_used_new = n_used_new.unsqueeze(-1)
+                        new_param /= n_used_new
+
+                    sd[name] = new_param
+
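+        # strict=False below reports missing/unexpected keys instead of raising,
+        # so partially matching checkpoints can still be loaded.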
+ missing, unexpected = self.load_state_dict( + sd, + strict=False) if not only_model else self.model.load_state_dict( + sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys:\n {missing}') + if len(unexpected) > 0: + print(f'\nUnexpected Keys:\n {unexpected}') + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start) + variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, + x_start.shape) + log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, + t, x_start.shape) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) + * x_t - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, + x_t.shape) * noise) + + def predict_start_from_z_and_v(self, x_t, t, v): + # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_t.shape) * v) + + def predict_eps_from_z_and_v(self, x_t, t, v): + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_t.shape) * x_t) + + def q_posterior(self, x_start, x_t, t): + tmp1 = extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) + tmp2 = extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) + posterior_mean = (tmp1 * x_start + tmp2 * x_t) + posterior_variance = extract_into_tensor(self.posterior_variance, t, + x_t.shape) + posterior_log_variance_clipped = extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, x, t, clip_denoised: bool): + model_out = self.model(x, t) + if self.parameterization == 'eps': + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == 'x0': + x_recon = model_out + if clip_denoised: + x_recon.clamp_(-1., 1.) 
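+ # feed the (optionally clamped) x_0 estimate into the closed-form posterior
+ # q(x_{t-1} | x_t, x_0) computed by q_posterior below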
+ + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t) + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, x, t, clip_denoised=True, repeat_noise=False): + b, *_, device = *x.shape, x.device + model_mean, _, model_log_variance = self.p_mean_variance( + x=x, t=t, clip_denoised=clip_denoised) + noise = noise_like(x.shape, device, repeat_noise) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape( + b, *((1, ) * (len(x.shape) - 1))) + return model_mean + nonzero_mask * (0.5 + * model_log_variance).exp() * noise + + @torch.no_grad() + def p_sample_loop(self, shape, return_intermediates=False): + device = self.betas.device + b = shape[0] + img = torch.randn(shape, device=device) + intermediates = [img] + for i in tqdm( + reversed(range(0, self.num_timesteps)), + desc='Sampling t', + total=self.num_timesteps): + img = self.p_sample( + img, + torch.full((b, ), i, device=device, dtype=torch.long), + clip_denoised=self.clip_denoised) + if i % self.log_every_t == 0 or i == self.num_timesteps - 1: + intermediates.append(img) + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, batch_size=16, return_intermediates=False): + image_size = self.image_size + channels = self.channels + return self.p_sample_loop( + (batch_size, channels, image_size, image_size), + return_intermediates=return_intermediates) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def get_v(self, x, noise, t): + tmp1 = extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) + tmp2 = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x.shape) + return (tmp1 * noise - tmp2 * x) + + def get_loss(self, pred, target, mean=True): + if self.loss_type == 'l1': + loss = (target - pred).abs() + if mean: + loss = loss.mean() + elif self.loss_type == 'l2': + if mean: + loss = torch.nn.functional.mse_loss(target, pred) + else: + loss = torch.nn.functional.mse_loss( + target, pred, reduction='none') + else: + raise NotImplementedError("unknown loss type '{loss_type}'") + + return loss + + def p_losses(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + model_out = self.model(x_noisy, t) + + loss_dict = {} + if self.parameterization == 'eps': + target = noise + elif self.parameterization == 'x0': + target = x_start + elif self.parameterization == 'v': + target = self.get_v(x_start, noise, t) + else: + raise NotImplementedError( + f'Parameterization {self.parameterization} not yet supported') + + loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3]) + + log_prefix = 'train' if self.training else 'val' + + loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()}) + loss_simple = loss.mean() * self.l_simple_weight + + loss_vlb = (self.lvlb_weights[t] * loss).mean() + loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb}) + + loss = loss_simple + self.original_elbo_weight * loss_vlb + + loss_dict.update({f'{log_prefix}/loss': loss}) + + return loss, loss_dict + + def forward(self, x, *args, **kwargs): + # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size + # assert h == img_size and w == 
img_size, f'height and width of image must be {img_size}' + t = torch.randint( + 0, self.num_timesteps, (x.shape[0], ), device=self.device).long() + return self.p_losses(x, t, *args, **kwargs) + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = rearrange(x, 'b h w c -> b c h w') + x = x.to(memory_format=torch.contiguous_format).float() + return x + + def shared_step(self, batch): + x = self.get_input(batch, self.first_stage_key) + loss, loss_dict = self(x) + return loss, loss_dict + + def training_step(self, batch, batch_idx): + for k in self.ucg_training: + p = self.ucg_training[k]['p'] + val = self.ucg_training[k]['val'] + if val is None: + val = '' + for i in range(len(batch[k])): + if self.ucg_prng.choice(2, p=[1 - p, p]): + batch[k][i] = val + + loss, loss_dict = self.shared_step(batch) + + self.log_dict( + loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + + self.log( + 'global_step', + self.global_step, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False) + + if self.use_scheduler: + lr = self.optimizers().param_groups[0]['lr'] + self.log( + 'lr_abs', + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False) + + return loss + + @torch.no_grad() + def validation_step(self, batch, batch_idx): + _, loss_dict_no_ema = self.shared_step(batch) + with self.ema_scope(): + _, loss_dict_ema = self.shared_step(batch) + loss_dict_ema = { + key + '_ema': loss_dict_ema[key] + for key in loss_dict_ema + } + self.log_dict( + loss_dict_no_ema, + prog_bar=False, + logger=True, + on_step=False, + on_epoch=True) + self.log_dict( + loss_dict_ema, + prog_bar=False, + logger=True, + on_step=False, + on_epoch=True) + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self.model) + + def _get_rows_from_list(self, samples): + n_imgs_per_row = len(samples) + denoise_grid = rearrange(samples, 'n b c h w -> b n c h w') + denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=2, + sample=True, + return_keys=None, + **kwargs): + log = dict() + x = self.get_input(batch, self.first_stage_key) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + x = x.to(self.device)[:N] + log['inputs'] = x + + # get diffusion row + diffusion_row = list() + x_start = x[:n_row] + + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(x_start) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + diffusion_row.append(x_noisy) + + log['diffusion_row'] = self._get_rows_from_list(diffusion_row) + + if sample: + # get denoise row + with self.ema_scope('Plotting'): + samples, denoise_row = self.sample( + batch_size=N, return_intermediates=True) + + log['samples'] = samples + log['denoise_row'] = self._get_rows_from_list(denoise_row) + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.learn_logvar: + params = params + [self.logvar] + opt = torch.optim.AdamW(params, lr=lr) + return opt + + +class LatentDiffusion(DDPM): + """main class""" + + def __init__(self, + 
first_stage_config, + cond_stage_config, + num_timesteps_cond=None, + cond_stage_key='image', + cond_stage_trainable=False, + concat_mode=True, + cond_stage_forward=None, + conditioning_key=None, + scale_factor=1.0, + scale_by_std=False, + force_null_conditioning=False, + *args, + **kwargs): + self.model_dir = kwargs.get('model_dir') + self.force_null_conditioning = force_null_conditioning + self.num_timesteps_cond = default(num_timesteps_cond, 1) + self.scale_by_std = scale_by_std + assert self.num_timesteps_cond <= kwargs['timesteps'] + # for backwards compatibility after implementation of DiffusionWrapper + if conditioning_key is None: + conditioning_key = 'concat' if concat_mode else 'crossattn' + if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning: + conditioning_key = None + ckpt_path = kwargs.pop('ckpt_path', None) + reset_ema = kwargs.pop('reset_ema', False) + reset_num_ema_updates = kwargs.pop('reset_num_ema_updates', False) + ignore_keys = kwargs.pop('ignore_keys', []) + super().__init__(conditioning_key=conditioning_key, *args, **kwargs) + self.concat_mode = concat_mode + self.cond_stage_trainable = cond_stage_trainable + self.cond_stage_key = cond_stage_key + try: + self.num_downs = len( + first_stage_config.params.ddconfig.ch_mult) - 1 + except Exception: + self.num_downs = 0 + if not scale_by_std: + self.scale_factor = scale_factor + else: + self.register_buffer('scale_factor', torch.tensor(scale_factor)) + self.instantiate_first_stage(first_stage_config) + self.instantiate_cond_stage(cond_stage_config) + self.cond_stage_forward = cond_stage_forward + self.clip_denoised = False + self.bbox_tokenizer = None + + self.restarted_from_ckpt = False + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys) + self.restarted_from_ckpt = True + if reset_ema: + assert self.use_ema + print( + 'Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.' + ) + self.model_ema = LitEma(self.model) + if reset_num_ema_updates: + print( + ' +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ' + ) + assert self.use_ema + self.model_ema.reset_num_updates() + + def make_cond_schedule(self, ): + self.cond_ids = torch.full( + size=(self.num_timesteps, ), + fill_value=self.num_timesteps - 1, + dtype=torch.long) + ids = torch.round( + torch.linspace(0, self.num_timesteps - 1, + self.num_timesteps_cond)).long() + self.cond_ids[:self.num_timesteps_cond] = ids + + @rank_zero_only + @torch.no_grad() + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + # only for very first batch + if (self.scale_by_std and self.current_epoch == 0 + and self.global_step == 0 and batch_idx == 0 + and not self.restarted_from_ckpt): + assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously' + # set rescale weight to 1./std of encodings + print('### USING STD-RESCALING ###') + x = super().get_input(batch, self.first_stage_key) + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + del self.scale_factor + self.register_buffer('scale_factor', 1. 
/ z.flatten().std()) + print(f'setting self.scale_factor to {self.scale_factor}') + print('### USING STD-RESCALING ###') + + def register_schedule(self, + given_betas=None, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + super().register_schedule(given_betas, beta_schedule, timesteps, + linear_start, linear_end, cosine_s) + + self.shorten_cond_schedule = self.num_timesteps_cond > 1 + if self.shorten_cond_schedule: + self.make_cond_schedule() + + def instantiate_first_stage(self, config): + model = instantiate_from_config(config) + self.first_stage_model = model.eval() + self.first_stage_model.train = disabled_train + for param in self.first_stage_model.parameters(): + param.requires_grad = False + + def instantiate_cond_stage(self, config): + config.params.model_path = os.path.join(self.model_dir, + config.params.model_path) + if not self.cond_stage_trainable: + if config == '__is_first_stage__': + print('Using first stage also as cond stage.') + self.cond_stage_model = self.first_stage_model + elif config == '__is_unconditional__': + print( + f'Training {self.__class__.__name__} as an unconditional model.' + ) + self.cond_stage_model = None + # self.be_unconditional = True + else: + model = instantiate_from_config(config) + self.cond_stage_model = model.eval() + self.cond_stage_model.train = disabled_train + for param in self.cond_stage_model.parameters(): + param.requires_grad = False + else: + assert config != '__is_first_stage__' + assert config != '__is_unconditional__' + model = instantiate_from_config(config) + self.cond_stage_model = model + + def _get_denoise_row_from_list(self, + samples, + desc='', + force_no_decoder_quantization=False): + denoise_row = [] + for zd in tqdm(samples, desc=desc): + denoise_row.append( + self.decode_first_stage( + zd.to(self.device), + force_not_quantize=force_no_decoder_quantization)) + n_imgs_per_row = len(denoise_row) + denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W + denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w') + denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + def get_first_stage_encoding(self, encoder_posterior): + if isinstance(encoder_posterior, DiagonalGaussianDistribution): + z = encoder_posterior.sample() + elif isinstance(encoder_posterior, torch.Tensor): + z = encoder_posterior + else: + raise NotImplementedError( + f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" + ) + return self.scale_factor * z + + def get_learned_conditioning(self, c): + # c 1,3,224,224 + if self.cond_stage_forward is None: + if hasattr(self.cond_stage_model, 'encode') and callable( + self.cond_stage_model.encode): + # 1,1,1024 + c = self.cond_stage_model.encode(c) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + else: + c = self.cond_stage_model(c) + else: + assert hasattr(self.cond_stage_model, self.cond_stage_forward) + c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) + return c + + def meshgrid(self, h, w): + y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1) + x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1) + + arr = torch.cat([y, x], dim=-1) + return arr + + def delta_border(self, h, w): + """ + :param h: height + :param w: width + :return: normalized distance to image border, + wtith min distance = 0 at border and max dist = 0.5 at image center + """ + lower_right_corner = torch.tensor([h - 1, 
w - 1]).view(1, 1, 2) + arr = self.meshgrid(h, w) / lower_right_corner + dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0] + dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0] + edge_dist = torch.min( + torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0] + return edge_dist + + def get_weighting(self, h, w, Ly, Lx, device): + weighting = self.delta_border(h, w) + weighting = torch.clip( + weighting, + self.split_input_params['clip_min_weight'], + self.split_input_params['clip_max_weight'], + ) + weighting = weighting.view(1, h * w, 1).repeat(1, 1, + Ly * Lx).to(device) + + if self.split_input_params['tie_braker']: + L_weighting = self.delta_border(Ly, Lx) + L_weighting = torch.clip( + L_weighting, self.split_input_params['clip_min_tie_weight'], + self.split_input_params['clip_max_tie_weight']) + + L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device) + weighting = weighting * L_weighting + return weighting + + def get_fold_unfold(self, + x, + kernel_size, + stride, + uf=1, + df=1): # todo load once not every time, shorten code + """ + :param x: img of size (bs, c, h, w) + :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1]) + """ + bs, nc, h, w = x.shape + + # number of crops in image + Ly = (h - kernel_size[0]) // stride[0] + 1 + Lx = (w - kernel_size[1]) // stride[1] + 1 + + if uf == 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params) + + weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, + Lx, x.device).to(x.dtype) + normalization = fold(weighting).view(1, 1, h, + w) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0], kernel_size[1], Ly * Lx)) + + elif uf > 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf), + dilation=1, + padding=0, + stride=(stride[0] * uf, stride[1] * uf)) + fold = torch.nn.Fold( + output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2) + + weighting = self.get_weighting(kernel_size[0] * uf, + kernel_size[1] * uf, Ly, Lx, + x.device).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h * uf, w * uf) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx)) + + elif df > 1 and uf == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] // df, kernel_size[0] // df), + dilation=1, + padding=0, + stride=(stride[0] // df, stride[1] // df)) + fold = torch.nn.Fold( + output_size=(x.shape[2] // df, x.shape[3] // df), + **fold_params2) + + weighting = self.get_weighting(kernel_size[0] // df, + kernel_size[1] // df, Ly, Lx, + x.device).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h // df, w // df) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx)) + + else: + raise NotImplementedError + + return fold, unfold, normalization, weighting + + @torch.no_grad() + def get_input(self, + batch, + k, + return_first_stage_outputs=False, + force_c_encode=False, + cond_key=None, + return_original_cond=False, + bs=None, + return_x=False): + x = super().get_input(batch, k) + if bs is not 
None: + x = x[:bs] + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + + if self.model.conditioning_key is not None and not self.force_null_conditioning: + if cond_key is None: + cond_key = self.cond_stage_key + if cond_key != self.first_stage_key: + if cond_key in ['caption', 'coordinates_bbox', 'txt']: + xc = batch[cond_key] + elif cond_key in ['class_label', 'cls']: + xc = batch + else: + xc = super().get_input(batch, cond_key).to(self.device) + else: + xc = x + if not self.cond_stage_trainable or force_c_encode: + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + c = self.get_learned_conditioning(xc.to(self.device)) + else: + c = xc + if bs is not None: + c = c[:bs] + + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + ckey = __conditioning_keys__[self.model.conditioning_key] + c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y} + + else: + c = None + xc = None + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + c = {'pos_x': pos_x, 'pos_y': pos_y} + out = [z, c] + if return_first_stage_outputs: + xrec = self.decode_first_stage(z) + out.extend([x, xrec]) + if return_x: + out.extend([x]) + if return_original_cond: + out.append(xc) + return out + + @torch.no_grad() + def decode_first_stage(self, + z, + predict_cids=False, + force_not_quantize=False): + if predict_cids: + if z.dim() == 4: + z = torch.argmax(z.exp(), dim=1).long() + z = self.first_stage_model.quantize.get_codebook_entry( + z, shape=None) + z = rearrange(z, 'b h w c -> b c h w').contiguous() + + z = 1. / self.scale_factor * z + return self.first_stage_model.decode(z) + + @torch.no_grad() + def encode_first_stage(self, x): + return self.first_stage_model.encode(x) + + def shared_step(self, batch, **kwargs): + x, c = self.get_input(batch, self.first_stage_key) + loss = self(x, c) + return loss + + def forward(self, x, c, *args, **kwargs): + # t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long() + t = self.time_steps.reshape((x.shape[0], )).to(self.device).long() + + if self.model.conditioning_key is not None: + assert c is not None + if self.cond_stage_trainable: + c = self.get_learned_conditioning(c) + if self.shorten_cond_schedule: # TODO: drop this option + tc = self.cond_ids[t].to(self.device) + c = self.q_sample( + x_start=c, t=tc, noise=torch.randn_like(c.float())) + return self.p_losses(x, c, t, *args, **kwargs) + + def apply_model(self, x_noisy, t, cond, return_ids=False): + if isinstance(cond, dict): + # hybrid case, cond is expected to be a dict + pass + else: + if not isinstance(cond, list): + cond = [cond] + key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn' + cond = {key: cond} + + x_recon = self.model(x_noisy, t, **cond) + + if isinstance(x_recon, tuple) and not return_ids: + return x_recon[0] + else: + return x_recon + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + tmp1 = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, + x_t.shape) + tmp2 = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, + x_t.shape) + return (tmp1 * x_t - pred_xstart) / tmp2 + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + This term can't be optimized, as it only depends on the encoder. + :param x_start: the [N x C x ...] tensor of inputs. 
+ :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = torch.tensor( + [self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) + return mean_flat(kl_prior) / np.log(2.0) + + def p_losses(self, x_start, cond, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + model_output = self.apply_model(x_noisy, t, cond) + + loss_dict = {} + prefix = 'train' if self.training else 'val' + + if self.parameterization == 'x0': + target = x_start + elif self.parameterization == 'eps': + target = noise + elif self.parameterization == 'v': + target = self.get_v(x_start, noise, t) + else: + raise NotImplementedError() + + loss_simple = self.get_loss(model_output, target, mean=False) + # boundary = self.boundary.to(loss_simple.device) + # boundary = F.interpolate(boundary, size = (64,64)) * 5 + 1.0 #16,1,64,64 + + # print(loss_simple.shape) #16,4,64,64 + loss_simple = loss_simple.mean([1, 2, 3]) + # .mean([1, 2, 3]) + loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()}) + + logvar_t = self.logvar[t].to(self.device) + loss = loss_simple / torch.exp(logvar_t) + logvar_t + # loss = loss_simple / torch.exp(self.logvar) + self.logvar + if self.learn_logvar: + loss_dict.update({f'{prefix}/loss_gamma': loss.mean()}) + loss_dict.update({'logvar': self.logvar.data.mean()}) + + loss = self.l_simple_weight * loss.mean() + + loss_vlb = self.get_loss( + model_output, target, mean=False).mean(dim=(1, 2, 3)) + loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean() + loss_dict.update({f'{prefix}/loss_vlb': loss_vlb}) + loss += (self.original_elbo_weight * loss_vlb) + loss_dict.update({f'{prefix}/loss': loss}) + + # print(self.parameterization, self.learn_logvar, self.original_elbo_weight, self.lvlb_weights[t]) + + return loss, loss_dict + + def p_mean_variance(self, + x, + c, + t, + clip_denoised: bool, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + score_corrector=None, + corrector_kwargs=None): + t_in = t + model_out = self.apply_model( + x, t_in, c, return_ids=return_codebook_ids) + + if score_corrector is not None: + assert self.parameterization == 'eps' + model_out = score_corrector.modify_score(self, model_out, x, t, c, + **corrector_kwargs) + + if return_codebook_ids: + model_out, logits = model_out + + if self.parameterization == 'eps': + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == 'x0': + x_recon = model_out + else: + raise NotImplementedError() + + if clip_denoised: + x_recon.clamp_(-1., 1.) 
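+ # Note: latent-space encodings are not bounded to [-1, 1], so callers of this
+ # method typically leave clip_denoised False (LatentDiffusion.__init__ sets
+ # self.clip_denoised = False); clamping is mainly meaningful for pixel-space DDPM.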
+ if quantize_denoised: + x_recon, _, [_, _, + indices] = self.first_stage_model.quantize(x_recon) + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t) + if return_codebook_ids: + return model_mean, posterior_variance, posterior_log_variance, logits + elif return_x0: + return model_mean, posterior_variance, posterior_log_variance, x_recon + else: + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, + x, + c, + t, + clip_denoised=False, + repeat_noise=False, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None): + b, *_, device = *x.shape, x.device + outputs = self.p_mean_variance( + x=x, + c=c, + t=t, + clip_denoised=clip_denoised, + return_codebook_ids=return_codebook_ids, + quantize_denoised=quantize_denoised, + return_x0=return_x0, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs) + if return_codebook_ids: + raise DeprecationWarning('Support dropped.') + model_mean, _, model_log_variance, logits = outputs + elif return_x0: + model_mean, _, model_log_variance, x0 = outputs + else: + model_mean, _, model_log_variance = outputs + + noise = noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape( + b, *((1, ) * (len(x.shape) - 1))) + + if return_codebook_ids: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1) + if return_x0: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise, x0 + else: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise + + @torch.no_grad() + def progressive_denoising(self, + cond, + shape, + verbose=True, + callback=None, + quantize_denoised=False, + img_callback=None, + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + batch_size=None, + x_T=None, + start_T=None, + log_every_t=None): + if not log_every_t: + log_every_t = self.log_every_t + timesteps = self.num_timesteps + if batch_size is not None: + b = batch_size if batch_size is not None else shape[0] + shape = [batch_size] + list(shape) + else: + b = batch_size = shape[0] + if x_T is None: + img = torch.randn(shape, device=self.device) + else: + img = x_T + intermediates = [] + if cond is not None: + if isinstance(cond, dict): + cond = { + key: + cond[key][:batch_size] if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = [c[:batch_size] for c in cond] if isinstance( + cond, list) else cond[:batch_size] + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = tqdm( + reversed(range(0, timesteps)), + desc='Progressive Generation', + total=timesteps) if verbose else reversed(range(0, timesteps)) + if type(temperature) == float: + temperature = [temperature] * timesteps + + for i in iterator: + ts = torch.full((b, ), i, device=self.device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != 'hybrid' + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample( + x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img, x0_partial = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + 
quantize_denoised=quantize_denoised, + return_x0=True, + temperature=temperature[i], + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs) + if mask is not None: + assert x0 is not None + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1. - mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(x0_partial) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + return img, intermediates + + @torch.no_grad() + def p_sample_loop(self, + cond, + shape, + return_intermediates=False, + x_T=None, + verbose=True, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + start_T=None, + log_every_t=None): + + if not log_every_t: + log_every_t = self.log_every_t + device = self.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + intermediates = [img] + if timesteps is None: + timesteps = self.num_timesteps + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = tqdm( + reversed(range(0, timesteps)), desc='Sampling t', + total=timesteps) if verbose else reversed(range(0, timesteps)) + + if mask is not None: + assert x0 is not None + assert x0.shape[2:3] == mask.shape[2: + 3] # spatial size has to match + + for i in iterator: + ts = torch.full((b, ), i, device=device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != 'hybrid' + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample( + x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + quantize_denoised=quantize_denoised) + if mask is not None: + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1. 
- mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(img) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, + cond, + batch_size=16, + return_intermediates=False, + x_T=None, + verbose=True, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + shape=None, + **kwargs): + if shape is None: + shape = (batch_size, self.channels, self.image_size, + self.image_size) + if cond is not None: + if isinstance(cond, dict): + cond = { + key: + cond[key][:batch_size] if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = [c[:batch_size] for c in cond] if isinstance( + cond, list) else cond[:batch_size] + return self.p_sample_loop( + cond, + shape, + return_intermediates=return_intermediates, + x_T=x_T, + verbose=verbose, + timesteps=timesteps, + quantize_denoised=quantize_denoised, + mask=mask, + x0=x0) + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + if ddim: + ddim_sampler = DDIMSampler(self) + shape = (self.channels, self.image_size, self.image_size) + samples, intermediates = ddim_sampler.sample( + ddim_steps, batch_size, shape, cond, verbose=False, **kwargs) + + else: + samples, intermediates = self.sample( + cond=cond, + batch_size=batch_size, + return_intermediates=True, + **kwargs) + + return samples, intermediates + + @torch.no_grad() + def get_unconditional_conditioning(self, batch_size, null_label=None): + if null_label is not None: + xc = null_label + if isinstance(xc, ListConfig): + xc = list(xc) + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + if hasattr(xc, 'to'): + xc = xc.to(self.device) + c = self.get_learned_conditioning(xc) + else: + if self.cond_stage_key in ['class_label', 'cls']: + xc = self.cond_stage_model.get_unconditional_conditioning( + batch_size, device=self.device) + return self.get_learned_conditioning(xc) + else: + raise NotImplementedError('todo') + if isinstance(c, list): # in case the encoder gives us a list + for i in range(len(c)): + c[i] = repeat( + c[i], '1 ... -> b ...', b=batch_size).to(self.device) + else: + c = repeat(c, '1 ... 
-> b ...', b=batch_size).to(self.device) + return c + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=50, + ddim_eta=0., + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=N) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + try: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + except KeyError: + # probably no "human_label" in batch + pass + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if quantize_denoised and not isinstance( + self.first_stage_model, AutoencoderKL) and not isinstance( + self.first_stage_model, IdentityFirstStage): + # also display when quantizing x0 while sampling + with ema_scope('Plotting Quantized Denoised'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + quantize_denoised=True) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True, + # quantize_denoised=True) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_x0_quantized'] = x_samples + + if unconditional_guidance_scale > 1.0: + uc = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + if 
self.model.conditioning_key == 'crossattn-adm': + uc = {'c_crossattn': [uc], 'c_adm': c['c_adm']} + with ema_scope('Sampling with classifier-free guidance'): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + if inpaint: + # make a simple center square + _, h, w = z.shape[0], z.shape[2], z.shape[3] + mask = torch.ones(N, h, w).to(self.device) + # zeros will be filled in + mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0. + mask = mask[:, None, ...] + with ema_scope('Plotting Inpaint'): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_inpainting'] = x_samples + log['mask'] = mask + + # outpaint + mask = 1. - mask + with ema_scope('Plotting Outpaint'): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_outpainting'] = x_samples + + if plot_progressive_rows: + with ema_scope('Plotting Progressives'): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N) + prog_row = self._get_denoise_row_from_list( + progressives, desc='Progressive Generation') + log['progressive_row'] = prog_row + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.cond_stage_trainable: + print( + f'{self.__class__.__name__}: Also optimizing conditioner params!' + ) + params = params + list(self.cond_stage_model.parameters()) + if self.learn_logvar: + print('Diffusion model optimizing logvar') + params.append(self.logvar) + opt = torch.optim.AdamW(params, lr=lr) + if self.use_scheduler: + assert 'target' in self.scheduler_config + scheduler = instantiate_from_config(self.scheduler_config) + + print('Setting up LambdaLR scheduler...') + scheduler = [{ + 'scheduler': + LambdaLR(opt, lr_lambda=scheduler.schedule), + 'interval': + 'step', + 'frequency': + 1 + }] + return [opt], scheduler + return opt + + @torch.no_grad() + def to_rgb(self, x): + x = x.float() + if not hasattr(self, 'colorize'): + self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x) + x = nn.functional.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. 
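+ # The random, fixed 1x1 convolution above projects an arbitrary number of
+ # channels down to 3 so map-style conditionings can be logged as RGB images;
+ # the result is then rescaled to [-1, 1] to match the range of the other logs.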
+ return x + + +class DiffusionWrapper(pl.LightningModule): + + def __init__(self, diff_model_config, conditioning_key): + super().__init__() + self.sequential_cross_attn = diff_model_config.pop( + 'sequential_crossattn', False) + self.diffusion_model = instantiate_from_config(diff_model_config) + self.conditioning_key = conditioning_key + assert self.conditioning_key in [ + None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', + 'crossattn-adm' + ] + + def forward(self, + x, + t, + c_concat: list = None, + c_crossattn: list = None, + c_adm=None): + if self.conditioning_key is None: + out = self.diffusion_model(x, t) + elif self.conditioning_key == 'concat': + xc = torch.cat([x] + c_concat, dim=1) + out = self.diffusion_model(xc, t) + elif self.conditioning_key == 'crossattn': + if not self.sequential_cross_attn: + cc = torch.cat(c_crossattn, 1) + else: + cc = c_crossattn + out = self.diffusion_model(x, t, context=cc) + elif self.conditioning_key == 'hybrid': + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc) + elif self.conditioning_key == 'hybrid-adm': + assert c_adm is not None + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc, y=c_adm) + elif self.conditioning_key == 'crossattn-adm': + assert c_adm is not None + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(x, t, context=cc, y=c_adm) + elif self.conditioning_key == 'adm': + cc = c_crossattn[0] + out = self.diffusion_model(x, t, y=cc) + else: + raise NotImplementedError() + + return out + + +class LatentUpscaleDiffusion(LatentDiffusion): + + def __init__(self, + *args, + low_scale_config, + low_scale_key='LR', + noise_level_key=None, + **kwargs): + super().__init__(*args, **kwargs) + # assumes that neither the cond_stage nor the low_scale_model contain trainable params + assert not self.cond_stage_trainable + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + self.noise_level_key = noise_level_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False): + if not log_mode: + z, c = super().get_input(batch, k, force_c_encode=True, bs=bs) + else: + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + x_low = batch[self.low_scale_key][:bs] + x_low = rearrange(x_low, 'b h w c -> b c h w') + x_low = x_low.to(memory_format=torch.contiguous_format).float() + zx, noise_level = self.low_scale_model(x_low) + if self.noise_level_key is not None: + # get noise level from batch instead, e.g. 
when extracting a custom noise level for bsr + raise NotImplementedError('TODO') + + all_conds = { + 'c_concat': [zx], + 'c_crossattn': [c], + 'c_adm': noise_level + } + if log_mode: + # TODO: maybe disable if too expensive + x_low_rec = self.low_scale_model.decode(zx) + return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level + return z, all_conds + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1., + return_keys=None, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input( + batch, self.first_stage_key, bs=N, log_mode=True) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + log['x_lr'] = x_low + log[f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}"] = x_low_rec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_tmp = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + # TODO explore better "unconditional" choices for the other keys + # maybe guide away from empty text label and highest noise level and maximally degraded zx? + uc = dict() + for k in c: + if k == 'c_crossattn': + assert isinstance(c[k], list) and len(c[k]) == 1 + uc[k] = [uc_tmp] + elif k == 'c_adm': # todo: only run with text-based guidance? 
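+ # For the noise-level conditioning the "unconditional" batch simply reuses the
+ # conditional value, so guidance only acts on the cross-attention part; the
+ # commented-out line below would instead push it to the maximum noise level.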
+ assert isinstance(c[k], torch.Tensor) + # uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level + uc[k] = c[k] + elif isinstance(c[k], list): + uc[k] = [c[k][i] for i in range(len(c[k]))] + else: + uc[k] = c[k] + + with ema_scope('Sampling with classifier-free guidance'): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + if plot_progressive_rows: + with ema_scope('Plotting Progressives'): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N) + prog_row = self._get_denoise_row_from_list( + progressives, desc='Progressive Generation') + log['progressive_row'] = prog_row + + return log + + +class LatentFinetuneDiffusion(LatentDiffusion): + """ + Basis for different finetunas, such as inpainting or depth2image + To disable finetuning mode, set finetune_keys to None + """ + + def __init__( + self, + concat_keys: tuple, + finetune_keys=('model.diffusion_model.input_blocks.0.0.weight', + 'model_ema.diffusion_modelinput_blocks00weight'), + keep_finetune_dims=4, + # if model was trained without concat mode before and we would like to keep these channels + c_concat_log_start=None, # to log reconstruction of c_concat codes + c_concat_log_end=None, + *args, + **kwargs): + ckpt_path = kwargs.pop('ckpt_path', None) + ignore_keys = kwargs.pop('ignore_keys', list()) + super().__init__(*args, **kwargs) + self.finetune_keys = finetune_keys + self.concat_keys = concat_keys + self.keep_dims = keep_finetune_dims + self.c_concat_log_start = c_concat_log_start + self.c_concat_log_end = c_concat_log_end + if exists(self.finetune_keys): + assert exists( + ckpt_path), 'can only finetune from a given checkpoint' + if exists(ckpt_path): + self.init_from_ckpt(ckpt_path, ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + sd = torch.load(path, map_location='cpu') + if 'state_dict' in list(sd.keys()): + sd = sd['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + + # make it explicit, finetune by including extra input channels + if exists(self.finetune_keys) and k in self.finetune_keys: + new_entry = None + for name, param in self.named_parameters(): + if name in self.finetune_keys: + print( + f"modifying key '{name}' and keeping its " + f'original {self.keep_dims} (channels) dimensions only' + ) + new_entry = torch.zeros_like(param) # zero init + assert exists( + new_entry), 'did not find matching parameter to modify' + new_entry[:, :self.keep_dims, ...] 
= sd[k] + sd[k] = new_entry + + missing, unexpected = self.load_state_dict( + sd, + strict=False) if not only_model else self.model.load_state_dict( + sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys: {missing}') + if len(unexpected) > 0: + print(f'Unexpected Keys: {unexpected}') + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1., + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, self.first_stage_key, bs=N, return_first_stage_outputs=True) + c_cat, c = c['c_concat'][0], c['c_crossattn'][0] + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if not (self.c_concat_log_start is None + and self.c_concat_log_end is None): + log['c_concat_decoded'] = self.decode_first_stage( + c_cat[:, self.c_concat_log_start:self.c_concat_log_end]) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + uc_cat = c_cat + uc_full = {'c_concat': [uc_cat], 'c_crossattn': [uc_cross]} + with ema_scope('Sampling with classifier-free guidance'): 
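+ # Classifier-free guidance: uc_full pairs the unconditional cross-attention
+ # embedding (uc_cross) with the same concat channels (uc_cat = c_cat), and the
+ # sampler blends the two predictions roughly as
+ # eps = eps_uncond + scale * (eps_cond - eps_uncond),
+ # so unconditional_guidance_scale > 1 strengthens the conditioning.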
+ samples_cfg, _ = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + return log + + +class LatentInpaintDiffusion(LatentFinetuneDiffusion): + """ + can either run as pure inpainting model (only concat mode) or with mixed conditionings, + e.g. mask as concat and text via cross-attn. + To disable finetuning mode, set finetune_keys to None + """ + + def __init__(self, + concat_keys=('mask', 'masked_image'), + masked_image_key='masked_image', + *args, + **kwargs): + super().__init__(concat_keys, *args, **kwargs) + self.masked_image_key = masked_image_key + assert self.masked_image_key in concat_keys + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + c_cat = list() + for ck in self.concat_keys: + cc = rearrange(batch[ck], 'b h w c -> b c h w').to( + memory_format=torch.contiguous_format).float() + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + bchw = z.shape + if ck != self.masked_image_key: + cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) + else: + cc = self.get_first_stage_encoding(self.encode_first_stage(cc)) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs) + log['masked_image'] = rearrange( + args[0]['masked_image'], 'b h w c -> b c h w').to( + memory_format=torch.contiguous_format).float() + return log + + +class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): + """ + condition on monocular depth estimation + """ + + def __init__(self, + depth_stage_config, + concat_keys=('midas_in', ), + *args, + **kwargs): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.depth_model = instantiate_from_config(depth_stage_config) + self.depth_stage_key = concat_keys[0] + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + c_cat = list() + for ck in self.concat_keys: + cc = batch[ck] + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + cc = self.depth_model(cc) + cc = torch.nn.functional.interpolate( + cc, + size=z.shape[2:], + mode='bicubic', + align_corners=False, + ) + + depth_min, depth_max = torch.amin( + cc, dim=[1, 2, 3], keepdim=True), torch.amax( 
+ cc, dim=[1, 2, 3], keepdim=True) + cc = 2. * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1. + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + depth = self.depth_model(args[0][self.depth_stage_key]) + depth_min, depth_max = torch.amin(depth, dim=[1, 2, 3], keepdim=True), \ + torch.amax(depth, dim=[1, 2, 3], keepdim=True) + log['depth'] = 2. * (depth - depth_min) / (depth_max - depth_min) - 1. + return log + + +class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): + """ + condition on low-res image (and optionally on some spatial noise augmentation) + """ + + def __init__(self, + concat_keys=('lr', ), + reshuffle_patch_size=None, + low_scale_config=None, + low_scale_key=None, + *args, + **kwargs): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.reshuffle_patch_size = reshuffle_patch_size + self.low_scale_model = None + if low_scale_config is not None: + print('Initializing a low-scale model') + assert exists(low_scale_key) + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + # optionally make spatial noise_level here + c_cat = list() + noise_level = None + for ck in self.concat_keys: + cc = batch[ck] + cc = rearrange(cc, 'b h w c -> b c h w') + if exists(self.reshuffle_patch_size): + assert isinstance(self.reshuffle_patch_size, int) + cc = rearrange( + cc, + 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w', + p1=self.reshuffle_patch_size, + p2=self.reshuffle_patch_size) + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + if exists(self.low_scale_model) and ck == self.low_scale_key: + cc, noise_level = self.low_scale_model(cc) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + if exists(noise_level): + all_conds = { + 'c_concat': [c_cat], + 'c_crossattn': [c], + 'c_adm': noise_level + } + else: + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + log['lr'] = rearrange(args[0]['lr'], 'b h w c -> b c h w') + return log diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py new file mode 100644 index 000000000..f92d5feb0 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py @@ -0,0 +1,328 @@ +"""SAMPLING ONLY.""" + +from functools import partial + +import numpy as np +import torch +from tqdm import tqdm + +from 
....ldm.models.diffusion.sampling_util import norm_thresholding +from ....ldm.modules.diffusionmodules.util import ( + make_ddim_sampling_parameters, make_ddim_timesteps, noise_like) + + +class PLMSSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + if ddim_eta != 0: + raise ValueError('ddim_eta must be 0 for PLMS') + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
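+ # it is combined with `conditioning` for classifier-free guidance whenever
+ # unconditional_guidance_scale != 1 (see get_model_output in p_sample_plms)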
+ dynamic_threshold=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for PLMS sampling is {size}') + + samples, intermediates = self.plms_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ) + return samples, intermediates + + @torch.no_grad() + def plms_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = list(reversed(range( + 0, timesteps))) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running PLMS Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps) + old_eps = [] + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + ts_next = torch.full((b, ), + time_range[min(i + 1, + len(time_range) - 1)], + device=device, + dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
- mask) * img + + outs = self.p_sample_plms( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + old_eps=old_eps, + t_next=ts_next, + dynamic_threshold=dynamic_threshold) + img, pred_x0, e_t = outs + old_eps.append(e_t) + if len(old_eps) >= 4: + old_eps.pop(0) + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_plms(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + old_eps=None, + t_next=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + def get_model_output(x, t): + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + e_t = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + c_in = torch.cat([unconditional_conditioning, c]) + e_t_uncond, e_t = self.model.apply_model(x_in, t_in, + c_in).chunk(2) + e_t = e_t_uncond + unconditional_guidance_scale * ( + e_t - e_t_uncond) + + if score_corrector is not None: + assert self.model.parameterization == 'eps' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + return e_t + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + + def get_x_prev_and_pred_x0(e_t, index): + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), + alphas_prev[index], + device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + if dynamic_threshold is not None: + pred_x0 = norm_thresholding(pred_x0, dynamic_threshold) + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + e_t = get_model_output(x, t) + if len(old_eps) == 0: + # Pseudo Improved Euler (2nd order) + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) + e_t_next = get_model_output(x_prev, t_next) + e_t_prime = (e_t + e_t_next) / 2 + elif len(old_eps) == 1: + # 2nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (3 * e_t - old_eps[-1]) / 2 + elif len(old_eps) == 2: + # 3nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 + elif len(old_eps) >= 3: + # 4nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] + - 9 * old_eps[-3]) / 24 + + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) + + return x_prev, pred_x0, e_t diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py new file mode 100644 index 000000000..52cfabed8 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py @@ -0,0 +1,25 @@ +import numpy as np +import torch + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions. + From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError( + f'input has {x.ndim} dims but target_dims is {target_dims}, which is less' + ) + return x[(..., ) + (None, ) * dims_to_append] + + +def norm_thresholding(x0, value): + s = append_dims( + x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) + return x0 * (value / s) + + +def spatial_norm_thresholding(x0, value): + # b c h w + s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) + return x0 * (value / s) diff --git a/modelscope/models/cv/anydoor/ldm/modules/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/attention.py b/modelscope/models/cv/anydoor/ldm/modules/attention.py new file mode 100644 index 000000000..37921b866 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/attention.py @@ -0,0 +1,367 @@ +import math +# CrossAttn precision handling +import os +from inspect import isfunction +from typing import Any, Optional + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import einsum, nn + +from ...ldm.modules.diffusionmodules.util import checkpoint + +try: + import xformers + import xformers.ops + XFORMERS_IS_AVAILABLE = True +except Exception: + XFORMERS_IS_AVAILABLE = False + +_ATTN_PRECISION = os.environ.get('ATTN_PRECISION', 'fp32') + + +def exists(val): + return val is not None + + +def uniq(arr): + return {el: True for el in arr}.keys() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + + +# feedforward +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out 
* 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + +class SpatialSelfAttention(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = rearrange(q, 'b c h w -> b (h w) c') + k = rearrange(k, 'b c h w -> b c (h w)') + w_ = torch.einsum('bij,bjk->bik', q, k) + + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, 'b c h w -> b c (h w)') + w_ = rearrange(w_, 'b i j -> b j i') + h_ = torch.einsum('bij,bjk->bik', v, w_) + h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) + h_ = self.proj_out(h_) + + return x + h_ + + +class CrossAttention(nn.Module): + + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + + def forward(self, x, context=None, mask=None): + h = self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), + (q, k, v)) + + # force cast to fp32 to avoid overflowing + if _ATTN_PRECISION == 'fp32': + with torch.autocast(enabled=False, device_type='cuda'): + q, k = q.float(), k.float() + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + else: + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + del q, k + + if exists(mask): + mask = rearrange(mask, 'b ... 
-> b (...)') + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, 'b j -> (b h) () j', h=h) + sim.masked_fill_(~mask, max_neg_value) + + # attention, what we cannot get enough of + sim = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', sim, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + return self.to_out(out) + + +class MemoryEfficientCrossAttention(nn.Module): + # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223 + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.0): + super().__init__() + print( + f'Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using ' + f'{heads} heads.') + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.heads = heads + self.dim_head = dim_head + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + self.attention_op: Optional[Any] = None + + def forward(self, x, context=None, mask=None): + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + b, _, _ = q.shape + q, k, v = map( + lambda t: t.unsqueeze(3).reshape(b, t.shape[ + 1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape( + b * self.heads, t.shape[1], self.dim_head).contiguous(), + (q, k, v), + ) + + # actually compute the attention, what we cannot get enough of + out = xformers.ops.memory_efficient_attention( + q, k, v, attn_bias=None, op=self.attention_op) + + if exists(mask): + raise NotImplementedError + out = ( + out.unsqueeze(0).reshape( + b, self.heads, out.shape[1], + self.dim_head).permute(0, 2, 1, + 3).reshape(b, out.shape[1], + self.heads * self.dim_head)) + return self.to_out(out) + + +class BasicTransformerBlock(nn.Module): + ATTENTION_MODES = { + 'softmax': CrossAttention, # vanilla attention + 'softmax-xformers': MemoryEfficientCrossAttention + } + + def __init__(self, + dim, + n_heads, + d_head, + dropout=0., + context_dim=None, + gated_ff=True, + checkpoint=True, + disable_self_attn=False): + super().__init__() + attn_mode = 'softmax-xformers' if XFORMERS_IS_AVAILABLE else 'softmax' + assert attn_mode in self.ATTENTION_MODES + attn_cls = self.ATTENTION_MODES[attn_mode] + self.disable_self_attn = disable_self_attn + self.attn1 = attn_cls( + query_dim=dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + context_dim=context_dim if self.disable_self_attn else + None) # is a self-attention if not self.disable_self_attn + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = attn_cls( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context=None): + x = self.attn1( + self.norm1(x), + context=context if self.disable_self_attn else None) + x + x = self.attn2(self.norm2(x), context=context) + x + x = self.ff(self.norm3(x)) + x + return x + + +class SpatialTransformer(nn.Module): + """ + Transformer block for 
image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + NEW: use_linear for more efficiency instead of the 1x1 convs + """ + + def __init__(self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0., + context_dim=None, + disable_self_attn=False, + use_linear=False, + use_checkpoint=True): + super().__init__() + if exists(context_dim) and not isinstance(context_dim, list): + context_dim = [context_dim] + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + if not use_linear: + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + else: + self.proj_in = nn.Linear(in_channels, inner_dim) + + self.transformer_blocks = nn.ModuleList([ + BasicTransformerBlock( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim[d], + disable_self_attn=disable_self_attn, + checkpoint=use_checkpoint) for d in range(depth) + ]) + if not use_linear: + self.proj_out = zero_module( + nn.Conv2d( + inner_dim, in_channels, kernel_size=1, stride=1, + padding=0)) + else: + self.proj_out = zero_module(nn.Linear(in_channels, inner_dim)) + self.use_linear = use_linear + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + if not isinstance(context, list): + context = [context] + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + if not self.use_linear: + x = self.proj_in(x) + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + if self.use_linear: + x = self.proj_in(x) + for i, block in enumerate(self.transformer_blocks): + x = block(x, context=context[i]) + if self.use_linear: + x = self.proj_out(x) + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + if not self.use_linear: + x = self.proj_out(x) + return x + x_in diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py new file mode 100644 index 000000000..77b2f3826 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py @@ -0,0 +1,966 @@ +# pytorch_diffusion + derived encoder decoder +import math +from typing import Any, Optional + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange + +from ....ldm.modules.attention import MemoryEfficientCrossAttention + +try: + import xformers + import xformers.ops + XFORMERS_IS_AVAILABLE = True +except Exception: + XFORMERS_IS_AVAILABLE = False + print("No module 'xformers'. Proceeding without it.") + + +def get_timestep_embedding(timesteps, embedding_dim): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: + From Fairseq. + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". 
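    Illustrative usage (an editorial sketch, not part of the original source;
    the timestep values are arbitrary): each integer timestep maps to a fixed
    `embedding_dim`-sized vector of concatenated sines and cosines, e.g.

        import torch
        t = torch.tensor([0, 10, 100])             # arbitrary example timesteps
        emb = get_timestep_embedding(t, embedding_dim=64)
        assert emb.shape == (3, 64)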
+ """ + assert len(timesteps.shape) == 1 + + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) + emb = emb.to(device=timesteps.device) + emb = timesteps.float()[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm( + num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + + +class Upsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, scale_factor=2.0, mode='nearest') + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode='constant', value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + + def __init__(self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class AttnBlock(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out 
= torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, c, h * w) + q = q.permute(0, 2, 1) # b,hw,c + k = k.reshape(b, c, h * w) # b,c,hw + w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b, c, h * w) + w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm( + v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b, c, h, w) + + h_ = self.proj_out(h_) + + return x + h_ + + +class MemoryEfficientAttnBlock(nn.Module): + """ + Uses xformers efficient implementation, + Note: this is a single-head self-attention operation + """ + + # + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.attention_op: Optional[Any] = None + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + B, C, H, W = q.shape + q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), + (q, k, v)) + + q, k, v = map( + lambda t: t.unsqueeze(3).reshape(B, t.shape[1], 1, C).permute( + 0, 2, 1, 3).reshape(B * 1, t.shape[1], C).contiguous(), + (q, k, v), + ) + out = xformers.ops.memory_efficient_attention( + q, k, v, attn_bias=None, op=self.attention_op) + + out = ( + out.unsqueeze(0).reshape(B, 1, out.shape[1], + C).permute(0, 2, 1, + 3).reshape(B, out.shape[1], C)) + out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C) + out = self.proj_out(out) + return x + out + + +class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention): + + def forward(self, x, context=None, mask=None): + b, c, h, w = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + out = super().forward(x, context=context, mask=mask) + out = rearrange(out, 'b (h w) c -> b c h w', h=h, w=w, c=c) + return x + out + + +def make_attn(in_channels, attn_type='vanilla', attn_kwargs=None): + assert attn_type in [ + 'vanilla', 'vanilla-xformers', 'memory-efficient-cross-attn', 'linear', + 'none' + ], f'attn_type {attn_type} unknown' + if XFORMERS_IS_AVAILABLE and attn_type == 'vanilla': + attn_type = 'vanilla-xformers' + print( + f"making attention of type '{attn_type}' with {in_channels} in_channels" + ) + if attn_type == 'vanilla': + assert attn_kwargs is None + return AttnBlock(in_channels) + elif attn_type == 'vanilla-xformers': + print( + f'building MemoryEfficientAttnBlock with {in_channels} in_channels...' 
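# Editorial aside (not part of the original diff): AttnBlock.forward above is
# plain single-head self-attention over the h*w positions, i.e.
# softmax(q k^T / sqrt(c)) applied to v. A self-contained sketch of that
# computation and its shapes (the tensors below are arbitrary stand-ins for
# the 1x1-conv outputs):
import torch

b, c, h, w = 1, 8, 4, 4
q = torch.randn(b, c, h * w)
k = torch.randn(b, c, h * w)
v = torch.randn(b, c, h * w)

attn = torch.softmax(torch.bmm(q.transpose(1, 2), k) * c**-0.5, dim=2)  # (b, hw, hw)
out = torch.bmm(v, attn.transpose(1, 2))                                # (b, c, hw)
assert out.shape == (b, c, h * w)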
+ ) + return MemoryEfficientAttnBlock(in_channels) + elif type == 'memory-efficient-cross-attn': + attn_kwargs['query_dim'] = in_channels + return MemoryEfficientCrossAttentionWrapper(**attn_kwargs) + elif attn_type == 'none': + return nn.Identity(in_channels) + else: + raise NotImplementedError() + + +class Model(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + use_timestep=True, + use_linear_attn=False, + attn_type='vanilla'): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = self.ch * 4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList([ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ]) + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + skip_in = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + if i_block == self.num_res_blocks: + skip_in = ch * in_ch_mult[i_level] + block.append( + ResnetBlock( + in_channels=block_in + skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, x, t=None, context=None): + # assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = 
get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], + dim=1), temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + def get_last_layer(self): + return self.conv_out.weight + + +class Encoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + double_z=True, + use_linear_attn=False, + attn_type='vanilla', + **ignore_kwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + 
hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type='vanilla', + **ignorekwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2**(self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print('Working with z of shape {} = {} dimensions.'.format( + self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class SimpleDecoder(nn.Module): + + def __init__(self, in_channels, out_channels, *args, **kwargs): + super().__init__() + self.model = nn.ModuleList([ + nn.Conv2d(in_channels, in_channels, 1), + ResnetBlock( + in_channels=in_channels, + out_channels=2 * in_channels, + 
temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=2 * in_channels, + out_channels=4 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=4 * in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + nn.Conv2d(2 * in_channels, in_channels, 1), + Upsample(in_channels, with_conv=True) + ]) + # end + self.norm_out = Normalize(in_channels) + self.conv_out = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + for i, layer in enumerate(self.model): + if i in [1, 2, 3]: + x = layer(x, None) + else: + x = layer(x) + + h = self.norm_out(x) + h = nonlinearity(h) + x = self.conv_out(h) + return x + + +class UpsampleDecoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ch, + num_res_blocks, + resolution, + ch_mult=(2, 2), + dropout=0.0): + super().__init__() + # upsampling + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = in_channels + curr_res = resolution // 2**(self.num_resolutions - 1) + self.res_blocks = nn.ModuleList() + self.upsample_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + res_block = [] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + res_block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + self.res_blocks.append(nn.ModuleList(res_block)) + if i_level != self.num_resolutions - 1: + self.upsample_blocks.append(Upsample(block_in, True)) + curr_res = curr_res * 2 + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # upsampling + h = x + for k, i_level in enumerate(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.res_blocks[i_level][i_block](h, None) + if i_level != self.num_resolutions - 1: + h = self.upsample_blocks[k](h) + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class LatentRescaler(nn.Module): + + def __init__(self, + factor, + in_channels, + mid_channels, + out_channels, + depth=2): + super().__init__() + # residual block, interpolate, residual block + self.factor = factor + self.conv_in = nn.Conv2d( + in_channels, mid_channels, kernel_size=3, stride=1, padding=1) + self.res_block1 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + self.attn = AttnBlock(mid_channels) + self.res_block2 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) + + def forward(self, x): + x = self.conv_in(x) + for block in self.res_block1: + x = block(x, None) + x = torch.nn.functional.interpolate( + x, + size=(int(round(x.shape[2] * self.factor)), + int(round(x.shape[3] * self.factor)))) + x = self.attn(x) + for block in self.res_block2: + x = block(x, None) + x = self.conv_out(x) + return x + + +class MergedRescaleEncoder(nn.Module): + + def __init__(self, + in_channels, + ch, + resolution, + out_ch, + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + ch_mult=(1, 2, 4, 8), + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + 
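# Editorial aside (not part of the original diff): the LatentRescaler above
# changes spatial size purely via F.interpolate with a rounded target size,
# so a factor of 0.5 maps a 32x32 latent to 16x16 while conv_in/conv_out set
# the channel counts. A self-contained sketch of that size arithmetic (the
# tensor values and factor are arbitrary):
import torch
import torch.nn.functional as F

x = torch.randn(1, 4, 32, 32)   # toy latent
factor = 0.5
y = F.interpolate(
    x, size=(int(round(x.shape[2] * factor)), int(round(x.shape[3] * factor))))
assert y.shape == (1, 4, 16, 16)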
intermediate_chn = ch * ch_mult[-1] + self.encoder = Encoder( + in_channels=in_channels, + num_res_blocks=num_res_blocks, + ch=ch, + ch_mult=ch_mult, + z_channels=intermediate_chn, + double_z=False, + resolution=resolution, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + out_ch=None) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=intermediate_chn, + mid_channels=intermediate_chn, + out_channels=out_ch, + depth=rescale_module_depth) + + def forward(self, x): + x = self.encoder(x) + x = self.rescaler(x) + return x + + +class MergedRescaleDecoder(nn.Module): + + def __init__(self, + z_channels, + out_ch, + resolution, + num_res_blocks, + attn_resolutions, + ch, + ch_mult=(1, 2, 4, 8), + dropout=0.0, + resamp_with_conv=True, + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + tmp_chn = z_channels * ch_mult[-1] + self.decoder = Decoder( + out_ch=out_ch, + z_channels=tmp_chn, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + in_channels=None, + num_res_blocks=num_res_blocks, + ch_mult=ch_mult, + resolution=resolution, + ch=ch) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=z_channels, + mid_channels=tmp_chn, + out_channels=tmp_chn, + depth=rescale_module_depth) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Upsampler(nn.Module): + + def __init__(self, + in_size, + out_size, + in_channels, + out_channels, + ch_mult=2): + super().__init__() + assert out_size >= in_size + num_blocks = int(np.log2(out_size // in_size)) + 1 + factor_up = 1. + (out_size % in_size) + print( + f'Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}' + ) + self.rescaler = LatentRescaler( + factor=factor_up, + in_channels=in_channels, + mid_channels=2 * in_channels, + out_channels=in_channels) + self.decoder = Decoder( + out_ch=out_channels, + resolution=out_size, + z_channels=in_channels, + num_res_blocks=2, + attn_resolutions=[], + in_channels=None, + ch=in_channels, + ch_mult=[ch_mult for _ in range(num_blocks)]) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Resize(nn.Module): + + def __init__(self, in_channels=None, learned=False, mode='bilinear'): + super().__init__() + self.with_conv = learned + self.mode = mode + if self.with_conv: + print( + f'Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode' + ) + raise NotImplementedError() + assert in_channels is not None + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x, scale_factor=1.0): + if scale_factor == 1.0: + return x + else: + x = torch.nn.functional.interpolate( + x, + mode=self.mode, + align_corners=False, + scale_factor=scale_factor) + return x diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py new file mode 100644 index 000000000..afe1b864b --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py @@ -0,0 +1,820 @@ +import math +from abc import abstractmethod + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from ....ldm.modules.attention import SpatialTransformer +from 
....ldm.modules.diffusionmodules.util import (avg_pool_nd, checkpoint, + conv_nd, linear, + normalization, + timestep_embedding, + zero_module) +from ....ldm.util import exists + + +# dummy replace +def convert_module_to_f16(x): + pass + + +def convert_module_to_f32(x): + pass + + +# go +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + x = layer(x, context) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd( + dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), + mode='nearest') + else: + x = F.interpolate(x, scale_factor=2, mode='nearest') + if self.use_conv: + x = self.conv(x) + return x + + +class TransposedUpsample(nn.Module): + 'Learned 2x upsampling without padding' + + def __init__(self, channels, out_channels=None, ks=5): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + + self.up = nn.ConvTranspose2d( + self.channels, self.out_channels, kernel_size=ks, stride=2) + + def forward(self, x): + return self.up(x) + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. 
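    Illustrative shape check (an editorial sketch, not part of the original
    source; the sizes are arbitrary): with the 2D defaults the spatial
    resolution is halved, e.g.

        import torch
        down = Downsample(channels=64, use_conv=True)   # relies on this module's own imports
        x = torch.randn(1, 64, 32, 32)
        assert down(x).shape == (1, 64, 16, 16)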
+ """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=padding) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels + if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd( + dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, + 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. 
+ """ + return checkpoint(self._forward, (x, emb), self.parameters(), + self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}' + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint( + self._forward, (x, ), self.parameters(), True + ) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! + # return pt_checkpoint(self._forward, x) # pytorch + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split( + ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', q * scale, + k * scale) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, + v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None + + if context_dim is not None: + assert use_spatial_transformer + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.') + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + if isinstance(self.num_classes, int): + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + elif self.num_classes == 'continuous': + print('setting up linear c_adm embedding layer') + self.label_emb = nn.Linear(1, time_embed_dim) + else: + raise ValueError() + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, 
+ use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(self.num_res_blocks[level] + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or i < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + if level and i == self.num_res_blocks[level]: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) if resblock_updown else Upsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module( + conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. 
+ :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), 'must specify y if and only if the model is class-conditional' + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + if self.num_classes is not None: + assert y.shape[0] == x.shape[0] + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + h = h.type(x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py new file mode 100644 index 000000000..bcc9d138f --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py @@ -0,0 +1,103 @@ +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +from ....ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_beta_schedule) +from ....ldm.util import default + + +class AbstractLowScaleModel(nn.Module): + # for concatenating a downsampled image to the latent representation + def __init__(self, noise_schedule_config=None): + super(AbstractLowScaleModel, self).__init__() + if noise_schedule_config is not None: + self.register_schedule(**noise_schedule_config) + + def register_schedule(self, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[ + 0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. 
/ alphas_cumprod - 1))) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def forward(self, x): + return x, None + + def decode(self, x): + return x + + +class SimpleImageConcat(AbstractLowScaleModel): + # no noise level conditioning + def __init__(self): + super(SimpleImageConcat, self).__init__(noise_schedule_config=None) + self.max_noise_level = 0 + + def forward(self, x): + # fix to constant noise level + return x, torch.zeros(x.shape[0], device=x.device).long() + + +class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel): + + def __init__(self, + noise_schedule_config, + max_noise_level=1000, + to_cuda=False): + super().__init__(noise_schedule_config=noise_schedule_config) + self.max_noise_level = max_noise_level + + def forward(self, x, noise_level=None): + if noise_level is None: + noise_level = torch.randint( + 0, self.max_noise_level, (x.shape[0], ), + device=x.device).long() + else: + assert isinstance(noise_level, torch.Tensor) + z = self.q_sample(x, noise_level) + return z, noise_level diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py new file mode 100644 index 000000000..d48ea5f52 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py @@ -0,0 +1,310 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! 
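Editor's note: the `q_sample` method of `AbstractLowScaleModel` above applies the closed-form forward-diffusion step x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, using buffers derived from `make_beta_schedule` (defined in the `util.py` hunk that follows), and `ImageConcatWithNoiseAugmentation` simply draws a random integer noise level per sample before calling it. The sketch below reproduces that step standalone with a plain linear schedule; it is an illustration only, not the repository module, and the helper names (`linear_betas`, `q_sample`) are ad hoc.

```python
# Minimal standalone sketch (not the repository module): the closed-form
# forward-diffusion step that AbstractLowScaleModel.q_sample performs with
# its registered buffers, shown here with a plain linear beta schedule.
import torch


def linear_betas(n_steps=1000, start=1e-4, end=2e-2):
    # Same parameterisation as make_beta_schedule's 'linear' branch:
    # betas are squared interpolations between sqrt(start) and sqrt(end).
    return torch.linspace(start ** 0.5, end ** 0.5, n_steps) ** 2


def q_sample(x_start, t, betas, noise=None):
    """Sample x_t ~ q(x_t | x_0) = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps."""
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
    noise = torch.randn_like(x_start) if noise is None else noise
    # Broadcast the per-timestep scalars over the image dimensions,
    # mirroring what extract_into_tensor does with gather + reshape.
    shape = (x_start.shape[0],) + (1,) * (x_start.dim() - 1)
    sqrt_ab = alphas_cumprod[t].sqrt().reshape(shape)
    sqrt_one_minus_ab = (1.0 - alphas_cumprod[t]).sqrt().reshape(shape)
    return sqrt_ab * x_start + sqrt_one_minus_ab * noise


if __name__ == '__main__':
    x0 = torch.randn(8, 4, 32, 32)            # a batch of latents
    t = torch.randint(0, 1000, (8,))          # one noise level per sample
    xt = q_sample(x0, t, linear_betas())
    print(xt.shape)                           # torch.Size([8, 4, 32, 32])
```

Because alpha_bar_t shrinks monotonically, larger `t` values push the sample further toward pure noise, which is how the noise-level conditioning in `ImageConcatWithNoiseAugmentation` controls augmentation strength.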
+ +import math +import os + +import numpy as np +import torch +import torch.nn as nn +from einops import repeat + +from ....ldm.util import instantiate_from_config + + +def make_beta_schedule(schedule, + n_timestep, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + if schedule == 'linear': + betas = ( + torch.linspace( + linear_start**0.5, + linear_end**0.5, + n_timestep, + dtype=torch.float64)**2) + + elif schedule == 'cosine': + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + + cosine_s) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == 'sqrt_linear': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64) + elif schedule == 'sqrt': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64)**0.5 + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_timesteps(ddim_discr_method, + num_ddim_timesteps, + num_ddpm_timesteps, + verbose=True): + if ddim_discr_method == 'uniform': + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == 'quad': + ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), + num_ddim_timesteps))**2).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f'Selected timesteps for ddim sampler: {steps_out}') + return steps_out + + +def make_ddim_sampling_parameters(alphacums, + ddim_timesteps, + eta, + verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + tmp = (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev) + sigmas = eta * np.sqrt(tmp) + if verbose: + print( + f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}' + ) + print( + f'For the chosen value of eta, which is {eta}, ' + f'this results in the following sigma_t schedule for ddim sampler {sigmas}' + ) + return sigmas, alphas, alphas_prev + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. 
+ """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1, ) * (len(x_shape) - 1))) + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = { + 'enabled': torch.is_autocast_enabled(), + 'dtype': torch.get_autocast_gpu_dtype(), + 'cache_enabled': torch.is_autocast_cache_enabled() + } + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [ + x.detach().requires_grad_(True) for x in ctx.input_tensors + ] + with torch.enable_grad(), \ + torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. + shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + else: + embedding = repeat(timesteps, 'b -> b d', d=dim) + return embedding + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. 
+ """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + + def forward(self, x): + return x * torch.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config( + c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} + + +def noise_like(shape, device, repeat=False): + + def repeat_noise(): + torch.randn( + (1, *shape[1:]), device=device).repeat(shape[0], + *((1, ) * (len(shape) - 1))) + + noise = lambda: torch.randn(shape, device=device) # noqa + return repeat_noise() if repeat else noise() diff --git a/modelscope/models/cv/anydoor/ldm/modules/distributions/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py b/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py new file mode 100644 index 000000000..dd094d532 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py @@ -0,0 +1,93 @@ +import numpy as np +import torch + + +class AbstractDistribution: + + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn( + self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if 
self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, 'at least one argument must be a Tensor' + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + tmp = ((mean1 - mean2)**2) * torch.exp(-logvar2) + return 0.5 * (-1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + + tmp) diff --git a/modelscope/models/cv/anydoor/ldm/modules/ema.py b/modelscope/models/cv/anydoor/ldm/modules/ema.py new file mode 100644 index 000000000..a1167fe70 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/ema.py @@ -0,0 +1,87 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError('Decay must be between 0 and 1') + + self.m_name2s_name = {} + self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) + self.register_buffer( + 'num_updates', + torch.tensor(0, dtype=torch.int) + if use_num_upates else torch.tensor(-1, dtype=torch.int)) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace('.', '') + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + tmp = (1 + self.num_updates) / (10 + self.num_updates) + decay = min(self.decay, tmp) + + one_minus_decay = 1.0 - decay + + with torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as( + m_param[key]) + tmp = shadow_params[sname] - m_param[key] + shadow_params[sname].sub_(one_minus_decay * tmp) + else: + assert key not in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_( + shadow_params[self.m_name2s_name[key]].data) + else: + assert key not in self.m_name2s_name + + def store(self, parameters): + """ + Save the current 
parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/modelscope/models/cv/anydoor/ldm/modules/encoders/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/encoders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py b/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py new file mode 100644 index 000000000..bfbfb78ea --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py @@ -0,0 +1,371 @@ +import os + +import open_clip +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint +from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel, + T5Tokenizer) + +from ....dinov2 import hubconf +from ....ldm.util import count_params + + +class LayerNormFp32(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm( + x.to(torch.float32), self.normalized_shape, self.weight, self.bias, + self.eps) + return x.to(orig_type) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm (with cast back to input dtype).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, + self.eps) + return x.to(orig_type) + + +class AbstractEncoder(nn.Module): + + def __init__(self): + super().__init__() + + def encode(self, *args, **kwargs): + raise NotImplementedError + + +class IdentityEncoder(AbstractEncoder): + + def encode(self, x): + return x + + +class ClassEmbedder(nn.Module): + + def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1): + super().__init__() + self.key = key + self.embedding = nn.Embedding(n_classes, embed_dim) + self.n_classes = n_classes + self.ucg_rate = ucg_rate + + def forward(self, batch, key=None, disable_dropout=False): + if key is None: + key = self.key + # this is for use in crossattn + c = batch[key][:, None] + if self.ucg_rate > 0. and not disable_dropout: + mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate) + c = mask * c + (1 - mask) * torch.ones_like(c) * ( + self.n_classes - 1) + c = c.long() + c = self.embedding(c) + return c + + def get_unconditional_conditioning(self, bs, device='cuda'): + uc_class = self.n_classes - 1 # 1000 classes --> 0 ... 
999, one extra class for ucg (class 1000) + uc = torch.ones((bs, ), device=device) * uc_class + uc = {self.key: uc} + return uc + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class FrozenT5Embedder(AbstractEncoder): + """Uses the T5 transformer encoder for text""" + + def __init__(self, + version='google/t5-v1_1-large', + device='cuda', + max_length=77, + freeze=True + ): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained(version) + self.transformer = T5EncoderModel.from_pretrained(version) + self.device = device + self.max_length = max_length # TODO: typical value? + if freeze: + self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class FrozenCLIPEmbedder(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + LAYERS = ['last', 'pooled', 'hidden'] + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77, + freeze=True, + layer='last', + layer_idx=None): # clip-vit-base-patch32 + super().__init__() + assert layer in self.LAYERS + self.tokenizer = CLIPTokenizer.from_pretrained(version) + self.transformer = CLIPTextModel.from_pretrained(version) + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + self.layer_idx = layer_idx + if layer == 'hidden': + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer( + input_ids=tokens, output_hidden_states=self.layer == 'hidden') + if self.layer == 'last': + z = outputs.last_hidden_state + elif self.layer == 'pooled': + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + return z + + def encode(self, text): + return self(text) + + +class FrozenOpenCLIPEmbedder(AbstractEncoder): + """ + Uses the OpenCLIP transformer encoder for text + """ + LAYERS = [ + # "pooled", + 'last', + 'penultimate' + ] + + def __init__(self, + arch='ViT-H-14', + version='laion2b_s32b_b79k', + device='cuda', + max_length=77, + freeze=True, + layer='last'): + super().__init__() + assert layer in self.LAYERS + model, _, _ = open_clip.create_model_and_transforms( + arch, device=torch.device('cpu'), pretrained=version) + del model.visual + self.model = model + + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + if self.layer == 'last': + self.layer_idx = 0 + elif 
self.layer == 'penultimate': + self.layer_idx = 1 + else: + raise NotImplementedError() + + def freeze(self): + self.model = self.model.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + tokens = open_clip.tokenize(text) + z = self.encode_with_transformer(tokens.to(self.device)) + return z + + def encode_with_transformer(self, text): + x = self.model.token_embedding(text) # [batch_size, n_ctx, d_model] + x = x + self.model.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.model.ln_final(x) + return x + + def text_transformer_forward(self, x: torch.Tensor, attn_mask=None): + for i, r in enumerate(self.model.transformer.resblocks): + if i == len(self.model.transformer.resblocks) - self.layer_idx: + break + if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting( + ): + x = checkpoint(r, x, attn_mask) + else: + x = r(x, attn_mask=attn_mask) + return x + + def encode(self, text): + return self(text) + + +class FrozenCLIPT5Encoder(AbstractEncoder): + + def __init__(self, + clip_version='openai/clip-vit-large-patch14', + t5_version='google/t5-v1_1-xl', + device='cuda', + clip_max_length=77, + t5_max_length=77): + super().__init__() + self.clip_encoder = FrozenCLIPEmbedder( + clip_version, device, max_length=clip_max_length) + self.t5_encoder = FrozenT5Embedder( + t5_version, device, max_length=t5_max_length) + print( + f'{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, ' + f'{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.' + ) + + def encode(self, text): + return self(text) + + def forward(self, text): + clip_z = self.clip_encoder.encode(text) + t5_z = self.t5_encoder.encode(text) + return [clip_z, t5_z] + + +class FrozenOpenCLIPImageEncoder(AbstractEncoder): + """ + Uses the OpenCLIP transformer encoder for image + """ + + def __init__(self, + arch='ViT-H-14', + version='laion2b_s32b_b79k', + device='cuda', + freeze=True): + super().__init__() + model, _, preprocess = open_clip.create_model_and_transforms( + arch, device=torch.device('cpu'), pretrained=version) + del model.transformer + self.model = model + self.model.visual.output_tokens = True + self.device = device + if freeze: + self.freeze() + self.image_mean = torch.tensor( + [0.48145466, 0.4578275, + 0.40821073]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.image_std = torch.tensor( + [0.26862954, 0.26130258, + 0.275777]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.projector_token = nn.Linear(1280, 1024) + self.projector_embed = nn.Linear(1024, 1024) + + def freeze(self): + self.model.visual.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def forward(self, image): + if isinstance(image, list): + image = torch.cat(image, 0) + image = (image.to(self.device) - self.image_mean.to( + self.device)) / self.image_std.to(self.device) + image_features, tokens = self.model.visual(image) + image_features = image_features.unsqueeze(1) + image_features = self.projector_embed(image_features) + tokens = self.projector_token(tokens) + hint = torch.cat([image_features, tokens], 1) + return hint + + def encode(self, image): + return self(image) + + +class FrozenDinoV2Encoder(AbstractEncoder): + """ + Uses the DINOv2 encoder for image + """ + + def __init__(self, model_path, device='cuda', freeze=True): + 
DINOv2_weight_path = model_path + + super().__init__() + dinov2 = hubconf.dinov2_vitg14() + state_dict = torch.load(DINOv2_weight_path) + dinov2.load_state_dict(state_dict, strict=False) + self.model = dinov2.to(device) + self.device = device + if freeze: + self.freeze() + self.image_mean = torch.tensor( + [0.485, 0.456, 0.406]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.image_std = torch.tensor( + [0.229, 0.224, 0.225]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.projector = nn.Linear(1536, 1024) + + def freeze(self): + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def forward(self, image): + if isinstance(image, list): + image = torch.cat(image, 0) + + image = (image.to(self.device) - self.image_mean.to( + self.device)) / self.image_std.to(self.device) + features = self.model.forward_features(image) + tokens = features['x_norm_patchtokens'] + image_features = features['x_norm_clstoken'] + image_features = image_features.unsqueeze(1) + hint = torch.cat([image_features, tokens], 1) # 8,257,1024 + hint = self.projector(hint) + return hint + + def encode(self, image): + return self(image) diff --git a/modelscope/models/cv/anydoor/ldm/util.py b/modelscope/models/cv/anydoor/ldm/util.py new file mode 100644 index 000000000..0a0c69e74 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/util.py @@ -0,0 +1,221 @@ +import importlib +from inspect import isfunction + +import numpy as np +import torch +from PIL import Image, ImageDraw, ImageFont +from torch import optim + + +def log_txt_as_img(wh, xc, size=10): + # wh a tuple of (width, height) + # xc a list of captions to plot + b = len(xc) + txts = list() + for bi in range(b): + txt = Image.new('RGB', wh, color='white') + draw = ImageDraw.Draw(txt) + font = ImageFont.truetype('font/DejaVuSans.ttf', size=size) + nc = int(40 * (wh[0] / 256)) + lines = '\n'.join(xc[bi][start:start + nc] + for start in range(0, len(xc[bi]), nc)) + + try: + draw.text((0, 0), lines, fill='black', font=font) + except UnicodeEncodeError: + print('Cant encode string for logging. Skipping.') + + txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 + txts.append(txt) + txts = np.stack(txts) + txts = torch.tensor(txts) + return txts + + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + + +def isimage(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def mean_flat(tensor): + """ + https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print( + f'{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.' 
+ ) + return total_params + + +def instantiate_from_config(config): + if 'target' not in config: + if config == '__is_first_stage__': + return None + elif config == '__is_unconditional__': + return None + raise KeyError('Expected key `target` to instantiate.') + return get_obj_from_str(config['target'])(**config.get('params', dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit('.', 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +class AdamWwithEMAandWings(optim.Optimizer): + # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 + def __init__(self, + params, + lr=1.e-3, + betas=(0.9, 0.999), + eps=1.e-8, + weight_decay=1.e-2, + amsgrad=False, + ema_decay=0.9999, + ema_power=1., + param_names=()): + """AdamW that saves EMA versions of the parameters.""" + if not 0.0 <= lr: + raise ValueError('Invalid learning rate: {}'.format(lr)) + if not 0.0 <= eps: + raise ValueError('Invalid epsilon value: {}'.format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError('Invalid beta parameter at index 0: {}'.format( + betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError('Invalid beta parameter at index 1: {}'.format( + betas[1])) + if not 0.0 <= weight_decay: + raise ValueError( + 'Invalid weight_decay value: {}'.format(weight_decay)) + if not 0.0 <= ema_decay <= 1.0: + raise ValueError('Invalid ema_decay value: {}'.format(ema_decay)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + ema_decay=ema_decay, + ema_power=ema_power, + param_names=param_names) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + ema_params_with_grad = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + ema_decay = group['ema_decay'] + ema_power = group['ema_power'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError( + 'AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of parameter values + state['param_exp_avg'] = p.detach().float().clone() + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + ema_params_with_grad.append(state['param_exp_avg']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optim._functional.adamw( + params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=False) + + cur_ema_decay = min(ema_decay, 1 - state['step']**-ema_power) + for param, ema_param in zip(params_with_grad, + ema_params_with_grad): + ema_param.mul_(cur_ema_decay).add_( + param.float(), alpha=1 - cur_ema_decay) + + return loss diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py index 2672ba9a8..1c08aa247 100644 --- a/modelscope/models/cv/body_3d_keypoints/__init__.py +++ b/modelscope/models/cv/body_3d_keypoints/__init__.py @@ -4,11 +4,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .cannonical_pose import BodyKeypointsDetection3D + from .canonical_pose import BodyKeypointsDetection3D from .hdformer import HDFormerDetector else: _import_structure = { - 'cannonical_pose': ['BodyKeypointsDetection3D'], + 'canonical_pose': ['BodyKeypointsDetection3D'], 'hdformer': ['HDFormerDetector'], } diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/__init__.py similarity index 100% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/__init__.py diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py similarity index 95% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py index e9c083950..57159f0cc 100644 --- a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py +++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py @@ -10,7 +10,7 @@ from modelscope.metainfo import Models from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.builder import MODELS -from modelscope.models.cv.body_3d_keypoints.cannonical_pose.canonical_pose_modules import ( +from modelscope.models.cv.body_3d_keypoints.canonical_pose.canonical_pose_modules import ( TemporalModel, TransCan3Dkeys) from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -218,17 +218,17 @@ def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, w = input_video_frame_num - pad * 2 lst_pose2d_rr = [] - lst_pose2d_cannoical = [] + lst_pose2d_canonical = [] for i in range(pad, w + pad): lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1]) - lst_pose2d_cannoical.append(pose2d_canonical[:, + lst_pose2d_canonical.append(pose2d_canonical[:, i - pad:i + pad + 1]) - input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) - input_pose2d_cannoical = 
torch.cat(lst_pose2d_cannoical, axis=0) + input_pose2d_rr = torch.cat(lst_pose2d_canonical, axis=0) + input_pose2d_canonical = torch.cat(lst_pose2d_canonical, axis=0) if self.cfg.model.MODEL.USE_CANONICAL_COORDS: - input_pose2d_abs = input_pose2d_cannoical.clone() + input_pose2d_abs = input_pose2d_canonical.clone() else: input_pose2d_abs = input_pose2d_rr.clone() input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1] @@ -238,8 +238,8 @@ def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, def canonicalize_2Ds(self, pos2d, f, c): cs = np.array([c[0], c[1]]).reshape(1, 1, 2) fs = np.array([f[0], f[1]]).reshape(1, 1, 2) - canoical_2Ds = (pos2d - cs) / fs - return canoical_2Ds + canonical_2Ds = (pos2d - cs) / fs + return canonical_2Ds def normalize_screen_coordinates(self, X, w, h): assert X.shape[-1] == 2 diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/canonical_pose_modules.py similarity index 100% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/canonical_pose_modules.py diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py index 73c9b4be3..135d5f50e 100644 --- a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py +++ b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py @@ -58,7 +58,7 @@ def load_model(self, load_to_cpu=False): self.net.eval() def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: - """Proprocess of 2D input joints. + """Preprocess of 2D input joints. Args: input (Dict[str, Any]): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints. diff --git a/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py b/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py index 078cc2ec8..75c65ef49 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py @@ -7,7 +7,7 @@ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): - """Rezise the sample to ensure the given size. Keeps aspect ratio. + """Resize the sample to ensure the given size. Keeps aspect ratio. Args: sample (dict): sample @@ -133,7 +133,7 @@ def get_size(self, width, height): # fit height scale_width = scale_height elif self.__resize_method == 'minimal': - # scale as least as possbile + # scale as least as possible if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width @@ -198,7 +198,7 @@ def __call__(self, sample): class NormalizeImage(object): - """Normlize image by given mean and std. + """Normalize image by given mean and std. 
""" def __init__(self, mean, std): diff --git a/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py b/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py index d348d1542..1a5f3c589 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py @@ -13,7 +13,7 @@ from torch.nn import functional as F -def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): +def decode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): ''' tpMap: center: tpMap[1, 0, :, :] @@ -61,7 +61,7 @@ def pred_lines(image, batch_image = torch.from_numpy(batch_image).float().cuda() outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) + pts, pts_score, vmap = decode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] end = vmap[:, :, 2:] dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) @@ -116,7 +116,7 @@ def pred_squares(image, model, input_shape=[512, 512], params=params_glob): batch_image = torch.from_numpy(batch_image).float().cuda() outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) + pts, pts_score, vmap = decode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] # (x, y) end = vmap[:, :, 2:] # (x, y) dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) @@ -268,7 +268,7 @@ def pred_squares(image, model, input_shape=[512, 512], params=params_glob): | dist(inter,0), dist(inter,0), dist(inter,0), ... | | dist(inter,1), dist(inter,1), dist(inter,1), ... | ... - dist_inter_to_semgnet2: + dist_inter_to_segment2: | dist(inter,0), dist(inter,1), dist(inter,2), ... | | dist(inter,0), dist(inter,1), dist(inter,2), ... | ... 
diff --git a/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py b/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py index 11e33c2fe..d1fb09fcf 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py @@ -130,7 +130,7 @@ def __call__(self, oriImg): limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] - # the middle joints heatmap correpondence + # the middle joints heatmap correspondence mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], diff --git a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py index 64f40da06..af050b75e 100644 --- a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py +++ b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py @@ -556,10 +556,10 @@ def forward(self, x): x = x + F.relu_(aspp_out[i] * 0.25) * pred_attn_list[i] bz = x.size(0) - # -- Besides, we also need to let the prediction attention be close to visable domain + # -- Besides, we also need to let the prediction attention be close to visible domain # -- Calculate the domain distance and get the weights # - First, detach domains - G_all_d = self.G_all.detach() # use detached G_all for calulcating + G_all_d = self.G_all.detach() # use detached G_all for calculating pred_attn_d = pred_attn.detach().view(bz, 512, 1, 1) if self.cosine == 1: diff --git a/modelscope/models/cv/dense_optical_flow_estimation/__init__.py b/modelscope/models/cv/dense_optical_flow_estimation/__init__.py new file mode 100644 index 000000000..be8fc28ed --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/__init__.py @@ -0,0 +1,21 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .raft_model import DenseOpticalFlowEstimation + +else: + _import_structure = { + 'raft_dense_optical_flow_estimation': ['DenseOpticalFlowEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/__init__.py b/modelscope/models/cv/dense_optical_flow_estimation/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py b/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py new file mode 100644 index 000000000..a0b1a27e4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py @@ -0,0 +1,95 @@ +import torch +import torch.nn.functional as F + +from modelscope.models.cv.dense_optical_flow_estimation.core.utils.utils import ( + bilinear_sampler, coords_grid) + +try: + import alt_cuda_corr +except ModuleNotFoundError: + # alt_cuda_corr is not compiled + pass + + +class CorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock.corr(fmap1, fmap2) + + batch, h1, w1, dim, h2, w2 = corr.shape + corr = corr.reshape(batch * h1 * w1, dim, 
h2, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels - 1): + corr = F.avg_pool2d(corr, 2, stride=2) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + dy = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) + + centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2**i + delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) + coords_lvl = centroid_lvl + delta_lvl + + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + batch, dim, ht, wd = fmap1.shape + fmap1 = fmap1.view(batch, dim, ht * wd) + fmap2 = fmap2.view(batch, dim, ht * wd) + + corr = torch.matmul(fmap1.transpose(1, 2), fmap2) + corr = corr.view(batch, ht, wd, 1, ht, wd) + return corr / torch.sqrt(torch.tensor(dim).float()) + + +class AlternateCorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + dim = self.pyramid[0][0].shape[1] + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) + corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / torch.sqrt(torch.tensor(dim).float()) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py b/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py new file mode 100644 index 000000000..eb8a85593 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py @@ -0,0 +1,297 @@ +# Data loading based on https://github.com/NVIDIA/flownet2-pytorch + +import math +import os +import os.path as osp +import random +from glob import glob + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data as data +from utils import frame_utils +from utils.augmentor import FlowAugmentor, SparseFlowAugmentor + + +class FlowDataset(data.Dataset): + + def __init__(self, aug_params=None, sparse=False): + self.augmentor = None + self.sparse = sparse + if aug_params is not None: + if sparse: + self.augmentor = SparseFlowAugmentor(**aug_params) + else: + self.augmentor = FlowAugmentor(**aug_params) + + self.is_test = False + self.init_seed = False + self.flow_list = [] + self.image_list = [] + self.extra_info = [] + + def __getitem__(self, index): + + if self.is_test: + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + img1 = np.array(img1).astype(np.uint8)[..., :3] + img2 = np.array(img2).astype(np.uint8)[..., :3] + img1 = 
torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + return img1, img2, self.extra_info[index] + + if not self.init_seed: + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + torch.manual_seed(worker_info.id) + np.random.seed(worker_info.id) + random.seed(worker_info.id) + self.init_seed = True + + index = index % len(self.image_list) + valid = None + if self.sparse: + flow, valid = frame_utils.readFlowKITTI(self.flow_list[index]) + else: + flow = frame_utils.read_gen(self.flow_list[index]) + + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + + flow = np.array(flow).astype(np.float32) + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + + # grayscale images + if len(img1.shape) == 2: + img1 = np.tile(img1[..., None], (1, 1, 3)) + img2 = np.tile(img2[..., None], (1, 1, 3)) + else: + img1 = img1[..., :3] + img2 = img2[..., :3] + + if self.augmentor is not None: + if self.sparse: + img1, img2, flow, valid = self.augmentor( + img1, img2, flow, valid) + else: + img1, img2, flow = self.augmentor(img1, img2, flow) + + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + flow = torch.from_numpy(flow).permute(2, 0, 1).float() + + if valid is not None: + valid = torch.from_numpy(valid) + else: + valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000) + + return img1, img2, flow, valid.float() + + def __rmul__(self, v): + self.flow_list = v * self.flow_list + self.image_list = v * self.image_list + return self + + def __len__(self): + return len(self.image_list) + + +class MpiSintel(FlowDataset): + + def __init__(self, + aug_params=None, + split='training', + root='datasets/Sintel', + dstype='clean'): + super(MpiSintel, self).__init__(aug_params) + flow_root = osp.join(root, split, 'flow') + image_root = osp.join(root, split, dstype) + + if split == 'test': + self.is_test = True + + for scene in os.listdir(image_root): + image_list = sorted(glob(osp.join(image_root, scene, '*.png'))) + for i in range(len(image_list) - 1): + self.image_list += [[image_list[i], image_list[i + 1]]] + self.extra_info += [(scene, i)] # scene and frame_id + + if split != 'test': + self.flow_list += sorted( + glob(osp.join(flow_root, scene, '*.flo'))) + + +class FlyingChairs(FlowDataset): + + def __init__(self, + aug_params=None, + split='train', + root='datasets/FlyingChairs_release/data'): + super(FlyingChairs, self).__init__(aug_params) + + images = sorted(glob(osp.join(root, '*.ppm'))) + flows = sorted(glob(osp.join(root, '*.flo'))) + assert (len(images) // 2 == len(flows)) + + split_list = np.loadtxt('chairs_split.txt', dtype=np.int32) + for i in range(len(flows)): + xid = split_list[i] + if (split == 'training' and xid == 1) or (split == 'validation' + and xid == 2): + self.flow_list += [flows[i]] + self.image_list += [[images[2 * i], images[2 * i + 1]]] + + +class FlyingThings3D(FlowDataset): + + def __init__(self, + aug_params=None, + root='datasets/FlyingThings3D', + dstype='frames_cleanpass'): + super(FlyingThings3D, self).__init__(aug_params) + + for cam in ['left']: + for direction in ['into_future', 'into_past']: + image_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*'))) + image_dirs = sorted([osp.join(f, cam) for f in image_dirs]) + + flow_dirs = sorted( + glob(osp.join(root, 'optical_flow/TRAIN/*/*'))) + flow_dirs = sorted( + [osp.join(f, direction, cam) for f in 
flow_dirs]) + + for idir, fdir in zip(image_dirs, flow_dirs): + images = sorted(glob(osp.join(idir, '*.png'))) + flows = sorted(glob(osp.join(fdir, '*.pfm'))) + for i in range(len(flows) - 1): + if direction == 'into_future': + self.image_list += [[images[i], images[i + 1]]] + self.flow_list += [flows[i]] + elif direction == 'into_past': + self.image_list += [[images[i + 1], images[i]]] + self.flow_list += [flows[i + 1]] + + +class KITTI(FlowDataset): + + def __init__(self, + aug_params=None, + split='training', + root='datasets/KITTI'): + super(KITTI, self).__init__(aug_params, sparse=True) + if split == 'testing': + self.is_test = True + + root = osp.join(root, split) + images1 = sorted(glob(osp.join(root, 'image_2/*_10.png'))) + images2 = sorted(glob(osp.join(root, 'image_2/*_11.png'))) + + for img1, img2 in zip(images1, images2): + frame_id = img1.split('/')[-1] + self.extra_info += [[frame_id]] + self.image_list += [[img1, img2]] + + if split == 'training': + self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png'))) + + +class HD1K(FlowDataset): + + def __init__(self, aug_params=None, root='datasets/HD1k'): + super(HD1K, self).__init__(aug_params, sparse=True) + + seq_ix = 0 + while 1: + flows = sorted( + glob( + os.path.join(root, 'hd1k_flow_gt', + 'flow_occ/%06d_*.png' % seq_ix))) + images = sorted( + glob( + os.path.join(root, 'hd1k_input', + 'image_2/%06d_*.png' % seq_ix))) + + if len(flows) == 0: + break + + for i in range(len(flows) - 1): + self.flow_list += [flows[i]] + self.image_list += [[images[i], images[i + 1]]] + + seq_ix += 1 + + +def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'): + """ Create the data loader for the corresponding trainign set """ + + if args.stage == 'chairs': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.1, + 'max_scale': 1.0, + 'do_flip': True + } + train_dataset = FlyingChairs(aug_params, split='training') + + elif args.stage == 'things': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.4, + 'max_scale': 0.8, + 'do_flip': True + } + clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass') + final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass') + train_dataset = clean_dataset + final_dataset + + elif args.stage == 'sintel': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.2, + 'max_scale': 0.6, + 'do_flip': True + } + things = FlyingThings3D(aug_params, dstype='frames_cleanpass') + sintel_clean = MpiSintel(aug_params, split='training', dstype='clean') + sintel_final = MpiSintel(aug_params, split='training', dstype='final') + + if TRAIN_DS == 'C+T+K+S+H': + kitti = KITTI({ + 'crop_size': args.image_size, + 'min_scale': -0.3, + 'max_scale': 0.5, + 'do_flip': True + }) + hd1k = HD1K({ + 'crop_size': args.image_size, + 'min_scale': -0.5, + 'max_scale': 0.2, + 'do_flip': True + }) + train_dataset = 100 * sintel_clean + 100 * sintel_final + 200 * kitti + 5 * hd1k + things + + elif TRAIN_DS == 'C+T+K/S': + train_dataset = 100 * sintel_clean + 100 * sintel_final + things + + elif args.stage == 'kitti': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.2, + 'max_scale': 0.4, + 'do_flip': False + } + train_dataset = KITTI(aug_params, split='training') + + train_loader = data.DataLoader( + train_dataset, + batch_size=args.batch_size, + pin_memory=False, + shuffle=True, + num_workers=4, + drop_last=True) + + print('Training with %d image pairs' % len(train_dataset)) + return train_loader diff --git 
a/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py b/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py new file mode 100644 index 000000000..dfa8e4de9 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py @@ -0,0 +1,285 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm3) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BottleneckBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes // 4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d( + planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes // 4) + self.norm2 = nn.BatchNorm2d(planes // 4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes // 4) + self.norm2 = nn.InstanceNorm2d(planes // 4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm4) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y 
= self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BasicEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class SmallEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, 
stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py b/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py new file mode 100644 index 000000000..f2b801bc4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py @@ -0,0 +1,163 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.cv.dense_optical_flow_estimation.core.corr import ( + AlternateCorrBlock, CorrBlock) +from modelscope.models.cv.dense_optical_flow_estimation.core.extractor import ( + BasicEncoder, SmallEncoder) +from modelscope.models.cv.dense_optical_flow_estimation.core.update import ( + BasicUpdateBlock, SmallUpdateBlock) +from modelscope.models.cv.dense_optical_flow_estimation.core.utils.utils import ( + bilinear_sampler, coords_grid, upflow8) + +autocast = torch.cuda.amp.autocast + +# try: +# autocast = torch.cuda.amp.autocast +# except: +# # dummy autocast for PyTorch < 1.6 +# class autocast: +# def __init__(self, enabled): +# pass +# def __enter__(self): +# pass +# def __exit__(self, *args): +# pass + + +class RAFT(TorchModel): + + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + args.corr_levels = 4 + args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + args.corr_levels = 4 + args.corr_radius = 4 + + if 'dropout' not in self.args: + self.args.dropout = 0 + + if 'alternate_corr' not in self.args: + self.args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder( + output_dim=128, norm_fn='instance', dropout=args.dropout) + self.cnet = SmallEncoder( + output_dim=hdim + cdim, norm_fn='none', dropout=args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = BasicEncoder( + output_dim=256, norm_fn='instance', dropout=args.dropout) + self.cnet = BasicEncoder( + output_dim=hdim + cdim, norm_fn='batch', dropout=args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H // 8, W // 8, device=img.device) + coords1 = coords_grid(N, H // 8, W // 8, device=img.device) + + # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = 
F.unfold(8 * flow, [3, 3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8 * H, 8 * W) + + def forward(self, + image1, + image2, + iters=20, + flow_init=None, + upsample=True, + test_mode=False): + """ Estimate optical flow between pair of frames """ + + image1 = 2 * (image1 / 255.0) - 1.0 + image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock( + fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, delta_flow = self.update_block( + net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/update.py b/modelscope/models/cv/dense_optical_flow_estimation/core/update.py new file mode 100644 index 000000000..b43bb0ecd --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/update.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + + h = (1 - z) * h + z * q + return h + + +class SepConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convr1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convq1 = nn.Conv2d( + hidden_dim + 
input_dim, hidden_dim, (1, 5), padding=(0, 2)) + + self.convz2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convr2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convq2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + return h + + +class SmallMotionEncoder(nn.Module): + + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class BasicMotionEncoder(nn.Module): + + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64 + 192, 128 - 2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class SmallUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82 + 64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + + +class BasicUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU( + hidden_dim=hidden_dim, input_dim=128 + hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(256, 64 * 9, 1, padding=0)) + + def forward(self, net, inp, corr, flow, upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/__init__.py 
b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py new file mode 100644 index 000000000..ff1b70dcb --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py @@ -0,0 +1,286 @@ +import math +import random + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torchvision.transforms import ColorJitter + +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + + +class FlowAugmentor: + + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): + + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter( + brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5 / 3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + """ Photometric augmentation """ + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array( + self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array( + self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array( + self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def eraser_transform(self, img1, img2, bounds=[50, 100]): + """ Occlusion augmentation """ + + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(bounds[0], bounds[1]) + dy = np.random.randint(bounds[0], bounds[1]) + img2[y0:y0 + dy, x0:x0 + dx, :] = mean_color + + return img1, img2 + + def spatial_transform(self, img1, img2, flow): + # randomly sample scale + ht, wd = img1.shape[:2] + min_scale = np.maximum((self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + + scale = 2**np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2**np.random.uniform(-self.max_stretch, + self.max_stretch) + scale_y *= 2**np.random.uniform(-self.max_stretch, + self.max_stretch) + + scale_x = np.clip(scale_x, min_scale, None) + scale_y = np.clip(scale_y, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize( + img1, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize( + img2, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + flow = cv2.resize( + flow, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + flow = flow * [scale_x, scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + 
img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow = self.spatial_transform(img1, img2, flow) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + + return img1, img2, flow + + +class SparseFlowAugmentor: + + def __init__(self, + crop_size, + min_scale=-0.2, + max_scale=0.5, + do_flip=False): + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter( + brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3 / 3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array( + self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + return img1, img2 + + def eraser_transform(self, img1, img2): + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(50, 100) + dy = np.random.randint(50, 100) + img2[y0:y0 + dy, x0:x0 + dx, :] = mean_color + + return img1, img2 + + def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = valid.reshape(-1).astype(np.float32) + + coords0 = coords[valid >= 1] + flow0 = flow[valid >= 1] + + ht1 = int(round(ht * fy)) + wd1 = int(round(wd * fx)) + + coords1 = coords0 * [fx, fy] + flow1 = flow0 * [fx, fy] + + xx = np.round(coords1[:, 0]).astype(np.int32) + yy = np.round(coords1[:, 1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) + valid_img = np.zeros([ht1, wd1], dtype=np.int32) + + flow_img[yy, xx] = flow1 + valid_img[yy, xx] = 1 + + return flow_img, valid_img + + def spatial_transform(self, img1, img2, flow, valid): + # randomly sample scale + + ht, wd = img1.shape[:2] + min_scale = np.maximum((self.crop_size[0] + 1) / float(ht), + (self.crop_size[1] + 1) / float(wd)) + + scale = 2**np.random.uniform(self.min_scale, self.max_scale) + scale_x = np.clip(scale, min_scale, None) + scale_y = np.clip(scale, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize( + img1, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize( + img2, + None, + fx=scale_x, + fy=scale_y, 
+ interpolation=cv2.INTER_LINEAR) + flow, valid = self.resize_sparse_flow_map( + flow, valid, fx=scale_x, fy=scale_y) + + if self.do_flip: + if np.random.rand() < 0.5: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + valid = valid[:, ::-1] + + margin_y = 20 + margin_x = 50 + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) + x0 = np.random.randint(-margin_x, + img1.shape[1] - self.crop_size[1] + margin_x) + + y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) + x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + valid = valid[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + return img1, img2, flow, valid + + def __call__(self, img1, img2, flow, valid): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, valid = self.spatial_transform( + img1, img2, flow, valid) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + valid = np.ascontiguousarray(valid) + + return img1, img2, flow, valid diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py new file mode 100644 index 000000000..46c92e348 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py @@ -0,0 +1,132 @@ +# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization + +# MIT License +# +# Copyright (c) 2018 Tom Runia +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to conditions. +# +# Author: Tom Runia +# Date Created: 2018-08-03 + +import numpy as np + + +def make_colorwheel(): + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf + + Code follows the original C++ source code of Daniel Scharstein. + Code follows the the Matlab source code of Deqing Sun. 
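+ The wheel spans 55 hue bins over six transitions (RY, YG, GC, CB, BM, MR), + so the returned array has shape [55, 3].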
+ + Returns: + np.ndarray: Color wheel + """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = np.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY) + col = col + RY + # YG + colorwheel[col:col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG) + colorwheel[col:col + YG, 1] = 255 + col = col + YG + # GC + colorwheel[col:col + GC, 1] = 255 + colorwheel[col:col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC) + col = col + GC + # CB + colorwheel[col:col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB) + colorwheel[col:col + CB, 2] = 255 + col = col + CB + # BM + colorwheel[col:col + BM, 2] = 255 + colorwheel[col:col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM) + col = col + BM + # MR + colorwheel[col:col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR) + colorwheel[col:col + MR, 0] = 255 + return colorwheel + + +def flow_uv_to_colors(u, v, convert_to_bgr=False): + """ + Applies the flow color wheel to (possibly clipped) flow components u and v. + + According to the C++ source code of Daniel Scharstein + According to the Matlab source code of Deqing Sun + + Args: + u (np.ndarray): Input horizontal flow of shape [H,W] + v (np.ndarray): Input vertical flow of shape [H,W] + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. + + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8) + colorwheel = make_colorwheel() # shape [55x3] + ncols = colorwheel.shape[0] + rad = np.sqrt(np.square(u) + np.square(v)) + a = np.arctan2(-v, -u) / np.pi + fk = (a + 1) / 2 * (ncols - 1) + k0 = np.floor(fk).astype(np.int32) + k1 = k0 + 1 + k1[k1 == ncols] = 0 + f = fk - k0 + for i in range(colorwheel.shape[1]): + tmp = colorwheel[:, i] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1 - f) * col0 + f * col1 + idx = (rad <= 1) + col[idx] = 1 - rad[idx] * (1 - col[idx]) + col[~idx] = col[~idx] * 0.75 # out of range + # Note the 2-i => BGR instead of RGB + ch_idx = 2 - i if convert_to_bgr else i + flow_image[:, :, ch_idx] = np.floor(255 * col) + return flow_image + + +def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False): + """ + Expects a two dimensional flow image of shape. + + Args: + flow_uv (np.ndarray): Flow UV image of shape [H,W,2] + clip_flow (float, optional): Clip maximum of flow values. Defaults to None. + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 
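+ + Flow vectors are normalized by the maximum radius before colorization, so the + output encodes flow direction as hue and relative magnitude as saturation.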
+ + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + assert flow_uv.ndim == 3, 'input flow must have three dimensions' + assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]' + if clip_flow is not None: + flow_uv = np.clip(flow_uv, 0, clip_flow) + u = flow_uv[:, :, 0] + v = flow_uv[:, :, 1] + rad = np.sqrt(np.square(u) + np.square(v)) + rad_max = np.max(rad) + epsilon = 1e-5 + u = u / (rad_max + epsilon) + v = v / (rad_max + epsilon) + return flow_uv_to_colors(u, v, convert_to_bgr) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py new file mode 100644 index 000000000..dac10fe1e --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py @@ -0,0 +1,142 @@ +import re +from os.path import * + +import cv2 +import numpy as np +from PIL import Image + +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +TAG_CHAR = np.array([202021.25], np.float32) + + +def readFlow(fn): + """ Read .flo file in Middlebury format""" + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + + # WARNING: this will work on little-endian architectures (eg Intel x86) only! + # print 'fn = %s'%(fn) + with open(fn, 'rb') as f: + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + print('Magic number incorrect. Invalid .flo file') + return None + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + # print 'Reading %d x %d flo file\n' % (w, h) + data = np.fromfile(f, np.float32, count=2 * int(w) * int(h)) + # Reshape data into 3D array (columns, rows, bands) + # The reshape here is for visualization, the original code is (w,h,2) + return np.resize(data, (int(h), int(w), 2)) + + +def readPFM(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header == b'PF': + color = True + elif header == b'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data + + +def writeFlow(filename, uv, v=None): + """ Write optical flow to file. + + If v is None, uv is assumed to contain both u and v channels, + stacked in depth. + Original code by Deqing Sun, adapted from Daniel Scharstein. 
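+ + The .flo layout written here is: a float32 magic tag (202021.25), then int32 width + and int32 height, then width*height*2 float32 values interleaved as (u, v) per pixel.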
+ """ + nBands = 2 + + if v is None: + assert (uv.ndim == 3) + assert (uv.shape[2] == 2) + u = uv[:, :, 0] + v = uv[:, :, 1] + else: + u = uv + + assert (u.shape == v.shape) + height, width = u.shape + f = open(filename, 'wb') + # write the header + f.write(TAG_CHAR) + np.array(width).astype(np.int32).tofile(f) + np.array(height).astype(np.int32).tofile(f) + # arrange into matrix form + tmp = np.zeros((height, width * nBands)) + tmp[:, np.arange(width) * 2] = u + tmp[:, np.arange(width) * 2 + 1] = v + tmp.astype(np.float32).tofile(f) + f.close() + + +def readFlowKITTI(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) + flow = flow[:, :, ::-1].astype(np.float32) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid + + +def readDispKITTI(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 + valid = disp > 0.0 + flow = np.stack([-disp, np.zeros_like(disp)], -1) + return flow, valid + + +def writeFlowKITTI(filename, uv): + uv = 64.0 * uv + 2**15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + + +def read_gen(file_name, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + return Image.open(file_name) + elif ext == '.bin' or ext == '.raw': + return np.load(file_name) + elif ext == '.flo': + return readFlow(file_name).astype(np.float32) + elif ext == '.pfm': + flow = readPFM(file_name).astype(np.float32) + if len(flow.shape) == 2: + return flow + else: + return flow[:, :, :-1] + return [] diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py new file mode 100644 index 000000000..6228e6ef4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py @@ -0,0 +1,93 @@ +import numpy as np +import torch +import torch.nn.functional as F +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [ + pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, + pad_ht - pad_ht // 2 + ] + else: + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata((x1, y1), + dx, (x0, y0), + method='nearest', + fill_value=0) + + flow_y = interpolate.griddata((x1, y1), + dy, (x0, y0), + method='nearest', + fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + 
""" Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1, 1], dim=-1) + xgrid = 2 * xgrid / (W - 1) - 1 + ygrid = 2 * ygrid / (H - 1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd, device): + coords = torch.meshgrid( + torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate( + flow, size=new_size, mode=mode, align_corners=True) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py b/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py new file mode 100644 index 000000000..2363092ae --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py @@ -0,0 +1,52 @@ +import argparse +import os.path as osp + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.dense_optical_flow_estimation.core.raft import RAFT +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.dense_optical_flow_estimation, + module_name=Models.raft_dense_optical_flow_estimation) +class DenseOpticalFlowEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + args = argparse.Namespace() + args.model = model_dir + args.small = False + args.mixed_precision = False + args.alternate_corr = False + self.model = torch.nn.DataParallel(RAFT(args)) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model.load_state_dict(torch.load(model_path)) + self.model = self.model.module + self.model.to('cuda') + self.model.eval() + + def forward(self, Inputs): + image1 = Inputs['image1'] + image2 = Inputs['image2'] + + flow_ups = self.model(image1, image2) + flow_up = flow_ups[-1] + + return flow_up + + def postprocess(self, inputs): + results = {OutputKeys.FLOWS: inputs} + return results + + def inference(self, data): + results = self.forward(data) + return results diff --git a/modelscope/models/cv/face_detection/mogface/models/resnet.py b/modelscope/models/cv/face_detection/mogface/models/resnet.py index 045f6fa37..dc0023c3b 100644 --- a/modelscope/models/cv/face_detection/mogface/models/resnet.py +++ b/modelscope/models/cv/face_detection/mogface/models/resnet.py @@ -1,6 +1,6 @@ -# The implementation is modified from original resent implementaiton, which is -# also open-sourced by the authors as Yang Liu, -# and is available publicly on https://github.com/damo-cv/MogFace +# The implementation is modified from original resent implementation, which is +# also open-sourced by the authors as Yang Liu, +# and is available publicly on https://github.com/damo-cv/MogFace import torch.nn as nn diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py index 11a59302f..545cfb18e 100644 --- 
a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py @@ -27,7 +27,7 @@ def __init__(self, """ Any ReLU-CNN Backbone Args: - plainet_struct: (obj: str): + plainnet_struct: (obj: str): Str of network topology structure. no_reslink: (obj:bool): no use residual structure. diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py index 3bae34d83..cee49276c 100644 --- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py @@ -1,5 +1,5 @@ """ -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +The implementation here is modified based on insightface, originally MIT license and publicly available at https://github.com/deepinsight/insightface/blob/master/detection/scrfd/mmdet/models/detectors/base.py """ from abc import ABCMeta, abstractmethod diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py index 117eaa82a..9f77f7953 100644 --- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py @@ -1,5 +1,5 @@ """ -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +The implementation here is modified based on insightface, originally MIT license and publicly available at https://github.com/deepinsight/insightface/blob/master/detection/scrfd/mmdet/models/detectors/single_stage.py """ import torch diff --git a/modelscope/models/cv/face_emotion/efficient/utils.py b/modelscope/models/cv/face_emotion/efficient/utils.py index c1fcd9b3c..e4a79ac65 100644 --- a/modelscope/models/cv/face_emotion/efficient/utils.py +++ b/modelscope/models/cv/face_emotion/efficient/utils.py @@ -207,7 +207,7 @@ def forward(self, x): class Conv2dStaticSamePadding(nn.Conv2d): """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size. - The padding mudule is calculated in construction function, then used in forward. + The padding module is calculated in construction function, then used in forward. """ def __init__(self, diff --git a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py index cad6cfe00..91d5379a3 100644 --- a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py +++ b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py @@ -186,7 +186,7 @@ class GhostBlocks(nn.Module): out_channels (int): Number of output channels. expand (int): Expand ratio of GhostBottleneck. Default: 1. kernel_size (int): Kernel size of depthwise convolution. Default: 5. - num_blocks (int): Number of GhostBottlecneck blocks. Default: 1. + num_blocks (int): Number of GhostBottleneck blocks. Default: 1. use_res (bool): Whether to use residual connection. Default: False. activation (str): Name of activation function. Default: LeakyReLU. """ @@ -242,7 +242,7 @@ class GhostPAN(nn.Module): blocks. Default: False kernel_size (int): Kernel size of depthwise convolution. Default: 5. expand (int): Expand ratio of GhostBottleneck. Default: 1. 
- num_blocks (int): Number of GhostBottlecneck blocks. Default: 1. + num_blocks (int): Number of GhostBottleneck blocks. Default: 1. use_res (bool): Whether to use residual connection. Default: False. num_extra_level (int): Number of extra conv layers for more feature levels. Default: 0. diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py index afe899632..b8c0eeb52 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50, IR_SE_101, IR_SE_152, IR_SE_200) diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py b/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py index 25b9fe332..ed0f41f8b 100644 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/backbones/iresnet.py import torch from torch import nn diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py index a1683225e..9876bd291 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py import torch import torch.nn as nn @@ -7,7 +7,7 @@ def initialize_weights(modules): - """ Weight initilize, conv2d and linear is initialized with kaiming_normal + """ Weight initialize, conv2d and linear is initialized with kaiming_normal """ for m in modules: if isinstance(m, nn.Conv2d): diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py b/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py index c9e01367e..d049ea42e 100644 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py @@ -1,4 +1,4 @@ -# The implementation is adopted from InsightFace, made pubicly available under the Apache-2.0 license at +# The implementation is adopted from InsightFace, made publicly available under the Apache-2.0 license at # https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py from collections import namedtuple diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py index 1982ca059..8e9f5f530 100755 
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py from collections import namedtuple diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py index 568e24ffc..479e7dd4e 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py import torch.nn as nn from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, diff --git a/modelscope/models/cv/face_reconstruction/models/facerecon_model.py b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py index d753b4163..008e0780f 100644 --- a/modelscope/models/cv/face_reconstruction/models/facerecon_model.py +++ b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py @@ -104,7 +104,7 @@ def __init__(self, zfar=opt.z_far, rasterize_size=int(2 * opt.center)) - self.comupte_color_loss = photo_loss + self.compute_color_loss = photo_loss def set_device(self, device): self.device = device @@ -444,7 +444,7 @@ def forward(self, visualize=False): self.facemodel_front.face_buf, self.bfm_UVs.clone(), pred_color_high) - loss_color_high = self.w_color * self.comupte_color_loss( + loss_color_high = self.w_color * self.compute_color_loss( pred_face_high, self.input_img_for_tex, self.pred_mask.detach()) loss_smooth = TVLoss()(texture_offset) * self.w_tex_smooth diff --git a/modelscope/models/cv/face_reconstruction/models/losses.py b/modelscope/models/cv/face_reconstruction/models/losses.py index 6d4af4e8d..c04a81661 100644 --- a/modelscope/models/cv/face_reconstruction/models/losses.py +++ b/modelscope/models/cv/face_reconstruction/models/losses.py @@ -49,7 +49,7 @@ def perceptual_loss(id_featureA, id_featureB): # image level loss def photo_loss(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA @@ -170,7 +170,7 @@ def _tensor_size(self, t): def photo_loss_sum(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA diff --git a/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py b/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py index c18881edc..5a8a4709e 100644 --- a/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py +++ 
b/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py @@ -322,7 +322,7 @@ def get_target_tensor(self, prediction, target_is_real): """Create label tensors with the same size as the input. Parameters: - prediction (tensor) - - tpyically the prediction from a discriminator + prediction (tensor) - - typically the prediction from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: @@ -336,10 +336,10 @@ def get_target_tensor(self, prediction, target_is_real): return target_tensor.expand_as(prediction) def __call__(self, prediction, target_is_real): - """Calculate loss given Discriminator's output and grount truth labels. + """Calculate loss given Discriminator's output and ground truth labels. Parameters: - prediction (tensor) - - tpyically the prediction output from a discriminator + prediction (tensor) - - typically the prediction output from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: diff --git a/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py b/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py index 54768fc1c..b9c2c9000 100644 --- a/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py +++ b/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py @@ -13,7 +13,7 @@ class Pix2PixModel(nn.Module): The model training requires '--dataset_mode aligned' dataset. By default, it uses a '--netG unet256' U-Net generator, a '--netD basic' discriminator (PatchGAN), - and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the orignal GAN paper). + and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the original GAN paper). 
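For context on the GANLoss docstrings corrected above: with '--gan_mode vanilla', the loss is binary cross-entropy between the discriminator prediction and a constant label tensor expanded to the prediction's shape. The sketch below illustrates that pattern in isolation; it is a hedged stand-in, not the pix2pix GANLoss class itself.

import torch
import torch.nn.functional as F

def vanilla_gan_loss(prediction: torch.Tensor, target_is_real: bool) -> torch.Tensor:
    # Label tensor: 1.0 for real, 0.0 for fake, broadcast to the prediction's
    # shape, then the cross-entropy objective from the original GAN paper
    # (BCE on logits).
    target = torch.full_like(prediction, 1.0 if target_is_real else 0.0)
    return F.binary_cross_entropy_with_logits(prediction, target)

# e.g. loss_G_GAN = vanilla_gan_loss(netD(fake_pair), target_is_real=True)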
pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf """ @@ -121,5 +121,5 @@ def optimize_parameters(self): self.set_requires_grad( self.netD, False) # D requires no gradients when optimizing G self.optimizer_G.zero_grad() # set G's gradients to zero - self.backward_G() # calculate graidents for G - self.optimizer_G.step() # udpate G's weights + self.backward_G() # calculate gradients for G + self.optimizer_G.step() # update G's weights diff --git a/modelscope/models/cv/face_reconstruction/models/renderer.py b/modelscope/models/cv/face_reconstruction/models/renderer.py index d10fd5604..bfe166b0c 100755 --- a/modelscope/models/cv/face_reconstruction/models/renderer.py +++ b/modelscope/models/cv/face_reconstruction/models/renderer.py @@ -20,7 +20,7 @@ def set_rasterizer(): class Pytorch3dRasterizer(nn.Module): - # TODO: add support for rendering non-squared images, since pytorc3d supports this now + # TODO: add support for rendering non-squared images, since pytorch3d supports this now """ Borrowed from https://github.com/facebookresearch/pytorch3d Notice: x,y,z are in image space, normalized @@ -158,7 +158,7 @@ def forward(self, -- Texture Rendering vertices: [batch_size, V, 3], vertices in world space, for calculating normals, then shading transformed_vertices: [batch_size, V, 3], range:normalized to [-1,1], projected vertices in image space - (that is aligned to the iamge pixel), for rasterization + (that is aligned to the image pixel), for rasterization albedos: [batch_size, 3, h, w], uv map lights: spherical homarnic: [N, 9(shcoeff), 3(rgb)] diff --git a/modelscope/models/cv/face_reconstruction/utils.py b/modelscope/models/cv/face_reconstruction/utils.py index 655d8b2a7..f23b2f707 100644 --- a/modelscope/models/cv/face_reconstruction/utils.py +++ b/modelscope/models/cv/face_reconstruction/utils.py @@ -767,6 +767,7 @@ def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.): # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face t, s = POS(lm5p.transpose(), lm3D.transpose()) + t = t.squeeze() s = rescale_factor / s # processing the image diff --git a/modelscope/models/cv/facial_68ldk_detection/__init__.py b/modelscope/models/cv/facial_68ldk_detection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py b/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py new file mode 100644 index 000000000..4690762b4 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py @@ -0,0 +1 @@ +from .alignment import Alignment diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py b/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py new file mode 100644 index 000000000..30b5773d7 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py @@ -0,0 +1,353 @@ +import os.path as osp + +from .base import Base + + +class Alignment(Base): + """ + Alignment configure file, which contains training parameters of alignment. 
+ """ + + def __init__(self, args): + super(Alignment, self).__init__('alignment') + self.ckpt_dir = '/mnt/workspace/humanAIGC/project/STAR/weights' + self.net = 'stackedHGnet_v1' + self.nstack = 4 + self.loader_type = 'alignment' + self.data_definition = '300W' # COFW, 300W, WFLW + self.test_file = 'test.tsv' + + # image + self.channels = 3 + self.width = 256 + self.height = 256 + self.means = (127.5, 127.5, 127.5) + self.scale = 1 / 127.5 + self.aug_prob = 1.0 + + self.display_iteration = 10 + self.val_epoch = 1 + self.valset = 'test.tsv' + self.norm_type = 'default' + self.encoder_type = 'default' + self.decoder_type = 'default' + + # scheduler & optimizer + self.milestones = [200, 350, 450] + self.max_epoch = 260 + self.optimizer = 'adam' + self.learn_rate = 0.001 + self.weight_decay = 0.00001 + self.betas = [0.9, 0.999] + self.gamma = 0.1 + + # batch_size & workers + self.batch_size = 32 + self.train_num_workers = 16 + self.val_batch_size = 32 + self.val_num_workers = 16 + self.test_batch_size = 16 + self.test_num_workers = 0 + + # tricks + self.ema = True + self.add_coord = True + self.use_AAM = True + + # loss + self.loss_func = 'STARLoss_v2' + + # STAR Loss paras + self.star_w = 1 + self.star_dist = 'smoothl1' + + self.init_from_args(args) + + # COFW + if self.data_definition == 'COFW': + self.edge_info = ( + (True, (0, 4, 2, 5)), # RightEyebrow + (True, (1, 6, 3, 7)), # LeftEyebrow + (True, (8, 12, 10, 13)), # RightEye + (False, (9, 14, 11, 15)), # LeftEye + (True, (18, 20, 19, 21)), # Nose + (True, (22, 26, 23, 27)), # LowerLip + (True, (22, 24, 23, 25)), # UpperLip + ) + if self.norm_type == 'ocular': + self.nme_left_index = 8 # ocular + self.nme_right_index = 9 # ocular + elif self.norm_type in ['pupil', 'default']: + self.nme_left_index = 16 # pupil + self.nme_right_index = 17 # pupil + else: + raise NotImplementedError + self.classes_num = [29, 7, 29] + self.crop_op = True + self.flip_mapping = ( + [0, 1], + [4, 6], + [2, 3], + [5, 7], + [8, 9], + [10, 11], + [12, 14], + [16, 17], + [13, 15], + [18, 19], + [22, 23], + ) + self.image_dir = osp.join(self.image_dir, 'COFW') + # 300W + elif self.data_definition == '300W': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16)), # FaceContour + (False, (17, 18, 19, 20, 21)), # RightEyebrow + (False, (22, 23, 24, 25, 26)), # LeftEyebrow + (False, (27, 28, 29, 30)), # NoseLine + (False, (31, 32, 33, 34, 35)), # Nose + (True, (36, 37, 38, 39, 40, 41)), # RightEye + (True, (42, 43, 44, 45, 46, 47)), # LeftEye + (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59)), # OuterLip + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 36 # ocular + self.nme_right_index = 45 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil + self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil + else: + raise NotImplementedError + self.classes_num = [68, 9, 68] + self.crop_op = True + self.flip_mapping = ( + [0, 16], + [1, 15], + [2, 14], + [3, 13], + [4, 12], + [5, 11], + [6, 10], + [7, 9], + [17, 26], + [18, 25], + [19, 24], + [20, 23], + [21, 22], + [31, 35], + [32, 34], + [36, 45], + [37, 44], + [38, 43], + [39, 42], + [40, 47], + [41, 46], + [48, 54], + [49, 53], + [50, 52], + [61, 63], + [60, 64], + [67, 65], + [58, 56], + [59, 55], + ) + self.image_dir = osp.join(self.image_dir, '300W') + # self.image_dir = osp.join(self.image_dir, '300VW_images') + # 300VW + elif 
self.data_definition == '300VW': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16)), # FaceContour + (False, (17, 18, 19, 20, 21)), # RightEyebrow + (False, (22, 23, 24, 25, 26)), # LeftEyebrow + (False, (27, 28, 29, 30)), # NoseLine + (False, (31, 32, 33, 34, 35)), # Nose + (True, (36, 37, 38, 39, 40, 41)), # RightEye + (True, (42, 43, 44, 45, 46, 47)), # LeftEye + (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59)), # OuterLip + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 36 # ocular + self.nme_right_index = 45 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil + self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil + else: + raise NotImplementedError + self.classes_num = [68, 9, 68] + self.crop_op = True + self.flip_mapping = ( + [0, 16], + [1, 15], + [2, 14], + [3, 13], + [4, 12], + [5, 11], + [6, 10], + [7, 9], + [17, 26], + [18, 25], + [19, 24], + [20, 23], + [21, 22], + [31, 35], + [32, 34], + [36, 45], + [37, 44], + [38, 43], + [39, 42], + [40, 47], + [41, 46], + [48, 54], + [49, 53], + [50, 52], + [61, 63], + [60, 64], + [67, 65], + [58, 56], + [59, 55], + ) + self.image_dir = osp.join(self.image_dir, + '300VW_Dataset_2015_12_14') + # WFLW + elif self.data_definition == 'WFLW': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32)), # FaceContour + (True, (33, 34, 35, 36, 37, 38, 39, 40, 41)), # RightEyebrow + (True, (42, 43, 44, 45, 46, 47, 48, 49, 50)), # LeftEyebrow + (False, (51, 52, 53, 54)), # NoseLine + (False, (55, 56, 57, 58, 59)), # Nose + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # RightEye + (True, (68, 69, 70, 71, 72, 73, 74, 75)), # LeftEye + (True, (76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, + 87)), # OuterLip + (True, (88, 89, 90, 91, 92, 93, 94, 95)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 60 # ocular + self.nme_right_index = 72 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = 96 # pupils + self.nme_right_index = 97 # pupils + else: + raise NotImplementedError + self.classes_num = [98, 9, 98] + self.crop_op = True + self.flip_mapping = ( + [0, 32], + [1, 31], + [2, 30], + [3, 29], + [4, 28], + [5, 27], + [6, 26], + [7, 25], + [8, 24], + [9, 23], + [10, 22], + [11, 21], + [12, 20], + [13, 19], + [14, 18], + [15, 17], # cheek + [33, 46], + [34, 45], + [35, 44], + [36, 43], + [37, 42], + [38, 50], + [39, 49], + [40, 48], + [41, 47], # elbrow + [60, 72], + [61, 71], + [62, 70], + [63, 69], + [64, 68], + [65, 75], + [66, 74], + [67, 73], + [55, 59], + [56, 58], + [76, 82], + [77, 81], + [78, 80], + [87, 83], + [86, 84], + [88, 92], + [89, 91], + [95, 93], + [96, 97]) + self.image_dir = osp.join(self.image_dir, 'WFLW', 'WFLW_images') + + self.label_num = self.nstack * 3 if self.use_AAM else self.nstack + self.loss_weights, self.criterions, self.metrics = [], [], [] + for i in range(self.nstack): + factor = (2**i) / (2**(self.nstack - 1)) + if self.use_AAM: + self.loss_weights += [ + factor * weight for weight in [1.0, 10.0, 10.0] + ] + self.criterions += [self.loss_func, 'AWingLoss', 'AWingLoss'] + self.metrics += ['NME', None, None] + else: + self.loss_weights += [factor * weight for weight in [1.0]] + self.criterions += [ + self.loss_func, + ] + self.metrics += [ + 'NME', + ] + + self.key_metric_index = (self.nstack - 1) * 
3 if self.use_AAM else ( + self.nstack - 1) + + # data + self.folder = self.get_foldername() + self.work_dir = osp.join(self.ckpt_dir, self.data_definition, + self.folder) + self.model_dir = osp.join(self.work_dir, 'model') + self.log_dir = osp.join(self.work_dir, 'log') + + self.train_tsv_file = osp.join(self.annot_dir, self.data_definition, + 'train.tsv') + self.train_pic_dir = self.image_dir + + self.val_tsv_file = osp.join(self.annot_dir, self.data_definition, + self.valset) + self.val_pic_dir = self.image_dir + + self.test_tsv_file = osp.join(self.annot_dir, self.data_definition, + self.test_file) + self.test_pic_dir = self.image_dir + + # self.train_tsv_file = osp.join(self.annot_dir, '300VW', "train.tsv") + # self.train_pic_dir = self.image_dir + + # self.val_tsv_file = osp.join(self.annot_dir, '300VW', self.valset) + # self.val_pic_dir = self.image_dir + + # self.test_tsv_file = osp.join(self.annot_dir, '300VW', self.test_file) + # self.test_pic_dir = self.image_dir + + def get_foldername(self): + str = '' + str += '{}_{}x{}_{}_ep{}_lr{}_bs{}'.format( + self.data_definition, self.height, self.width, self.optimizer, + self.max_epoch, self.learn_rate, self.batch_size) + str += '_{}'.format(self.loss_func) + str += '_{}_{}'.format( + self.star_dist, + self.star_w) if self.loss_func == 'STARLoss' else '' + str += '_AAM' if self.use_AAM else '' + str += '_{}'.format( + self.valset[:-4]) if self.valset != 'test.tsv' else '' + str += '_{}'.format(self.id) + return str diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/base.py b/modelscope/models/cv/facial_68ldk_detection/conf/base.py new file mode 100644 index 000000000..304505241 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/base.py @@ -0,0 +1,102 @@ +import logging +import os.path as osp +import uuid +from argparse import Namespace + +# from tensorboardX import SummaryWriter + + +class Base: + """ + Base configure file, which contains the basic training parameters + and should be inherited by other attribute configure file. + """ + + def __init__(self, + config_name, + ckpt_dir='./', + image_dir='./', + annot_dir='./'): + self.type = config_name + self.id = str(uuid.uuid4()) + self.note = '' + + self.ckpt_dir = ckpt_dir + self.image_dir = image_dir + self.annot_dir = annot_dir + + self.loader_type = 'alignment' + self.loss_func = 'STARLoss' + + # train + self.batch_size = 128 + self.val_batch_size = 1 + self.test_batch_size = 32 + self.channels = 3 + self.width = 256 + self.height = 256 + + # mean values in r, g, b channel. 
+ self.means = (127, 127, 127) + self.scale = 0.0078125 + + self.display_iteration = 100 + self.milestones = [50, 80] + self.max_epoch = 100 + + self.net = 'stackedHGnet_v1' + self.nstack = 4 + + # ["adam", "sgd"] + self.optimizer = 'adam' + self.learn_rate = 0.1 + self.momentum = 0.01 # caffe: 0.99 + self.weight_decay = 0.0 + self.nesterov = False + self.scheduler = 'MultiStepLR' + self.gamma = 0.1 + + self.loss_weights = [1.0] + self.criterions = ['SoftmaxWithLoss'] + self.metrics = ['Accuracy'] + self.key_metric_index = 0 + self.classes_num = [1000] + self.label_num = len(self.classes_num) + + # model + self.ema = False + self.use_AAM = True + + # visualization + self.writer = None + + # log file + self.logger = None + + def init_instance(self): + # self.writer = SummaryWriter(logdir=self.log_dir, comment=self.type) + log_formatter = logging.Formatter( + '%(asctime)s %(levelname)-8s: %(message)s') + root_logger = logging.getLogger() + file_handler = logging.FileHandler(osp.join(self.log_dir, 'log.txt')) + file_handler.setFormatter(log_formatter) + file_handler.setLevel(logging.NOTSET) + root_logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setFormatter(log_formatter) + console_handler.setLevel(logging.NOTSET) + root_logger.addHandler(console_handler) + root_logger.setLevel(logging.NOTSET) + self.logger = root_logger + + def __del__(self): + # tensorboard --logdir self.log_dir + if self.writer is not None: + # self.writer.export_scalars_to_json(self.log_dir + "visual.json") + self.writer.close() + + def init_from_args(self, args: Namespace): + args_vars = vars(args) + for key, value in args_vars.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) diff --git a/modelscope/models/cv/facial_68ldk_detection/infer.py b/modelscope/models/cv/facial_68ldk_detection/infer.py new file mode 100644 index 000000000..ccc6229a0 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/infer.py @@ -0,0 +1,204 @@ +import argparse +import copy +import math + +import cv2 +import numpy as np +import torch + +# private package +from .lib import utility + + +class GetCropMatrix(): + """ + from_shape -> transform_matrix + """ + + def __init__(self, image_size, target_face_scale, align_corners=False): + self.image_size = image_size + self.target_face_scale = target_face_scale + self.align_corners = align_corners + + def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, + to_center): + cosv = math.cos(angle) + sinv = math.sin(angle) + + fx, fy = from_center + tx, ty = to_center + + acos = scale * cosv + asin = scale * sinv + + a0 = acos + a1 = -asin + a2 = tx - acos * fx + asin * fy + shift_xy[0] + + b0 = asin + b1 = acos + b2 = ty - asin * fx - acos * fy + shift_xy[1] + + rot_scale_m = np.array([[a0, a1, a2], [b0, b1, b2], [0.0, 0.0, 1.0]], + np.float32) + return rot_scale_m + + def process(self, scale, center_w, center_h): + if self.align_corners: + to_w, to_h = self.image_size - 1, self.image_size - 1 + else: + to_w, to_h = self.image_size, self.image_size + + rot_mu = 0 + scale_mu = self.image_size / (scale * self.target_face_scale * 200.0) + shift_xy_mu = (0, 0) + matrix = self._compose_rotate_and_scale( + rot_mu, + scale_mu, + shift_xy_mu, + from_center=[center_w, center_h], + to_center=[to_w / 2.0, to_h / 2.0]) + return matrix + + +class TransformPerspective(): + """ + image, matrix3x3 -> transformed_image + """ + + def __init__(self, image_size): + self.image_size = image_size + + def process(self, image, 
matrix): + return cv2.warpPerspective( + image, + matrix, + dsize=(self.image_size, self.image_size), + flags=cv2.INTER_LINEAR, + borderValue=0) + + +class TransformPoints2D(): + """ + points (nx2), matrix (3x3) -> points (nx2) + """ + + def process(self, srcPoints, matrix): + # nx3 + desPoints = np.concatenate( + [srcPoints, np.ones_like(srcPoints[:, [0]])], axis=1) + desPoints = desPoints @ np.transpose(matrix) # nx3 + desPoints = desPoints[:, :2] / desPoints[:, [2, 2]] + return desPoints.astype(srcPoints.dtype) + + +class Alignment: + + def __init__(self, args, model_path, dl_framework, device_ids): + self.input_size = 256 + self.target_face_scale = 1.0 + self.dl_framework = dl_framework + + # model + if self.dl_framework == 'pytorch': + # conf + self.config = utility.get_config(args) + self.config.device_id = device_ids[0] + + # set environment + utility.set_environment(self.config) + + net = utility.get_net(self.config) + if device_ids == [-1]: + checkpoint = torch.load(model_path, map_location='cpu') + else: + checkpoint = torch.load(model_path) + net.load_state_dict(checkpoint['net']) + + if self.config.device_id == -1: + net = net.cpu() + else: + net = net.to(self.config.device_id) + + net.eval() + self.alignment = net + else: + assert False + + self.getCropMatrix = GetCropMatrix( + image_size=self.input_size, + target_face_scale=self.target_face_scale, + align_corners=True) + self.transformPerspective = TransformPerspective( + image_size=self.input_size) + self.transformPoints2D = TransformPoints2D() + + def norm_points(self, points, align_corners=False): + if align_corners: + # [0, SIZE-1] -> [-1, +1] + return points / torch.tensor([ + self.input_size - 1, self.input_size - 1 + ]).to(points).view(1, 1, 2) * 2 - 1 + else: + # [-0.5, SIZE-0.5] -> [-1, +1] + return (points * 2 + 1) / torch.tensor([ + self.input_size, self.input_size + ]).to(points).view(1, 1, 2) - 1 + + def denorm_points(self, points, align_corners=False): + if align_corners: + # [-1, +1] -> [0, SIZE-1] + return (points + 1) / 2 * torch.tensor([ + self.input_size - 1, self.input_size - 1 + ]).to(points).view(1, 1, 2) + else: + # [-1, +1] -> [-0.5, SIZE-0.5] + return ((points + 1) * torch.tensor( # noqa + [self.input_size, self.input_size]).to(points).view(1, 1, + 2) # noqa + - 1) / 2 # noqa + + def preprocess(self, image, scale, center_w, center_h): + matrix = self.getCropMatrix.process(scale, center_w, center_h) + input_tensor = self.transformPerspective.process(image, matrix) + input_tensor = input_tensor[np.newaxis, :] + + input_tensor = torch.from_numpy(input_tensor) + input_tensor = input_tensor.float().permute(0, 3, 1, 2) + input_tensor = input_tensor / 255.0 * 2.0 - 1.0 + + if self.config.device_id == -1: + input_tensor = input_tensor.cpu() + else: + input_tensor = input_tensor.to(self.config.device_id) + + return input_tensor, matrix + + def postprocess(self, srcPoints, coeff): + # dstPoints = self.transformPoints2D.process(srcPoints, coeff) + # matrix^(-1) * src = dst + # src = matrix * dst + dstPoints = np.zeros(srcPoints.shape, dtype=np.float32) + for i in range(srcPoints.shape[0]): + dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][ + 1] * srcPoints[i][1] + coeff[0][2] + dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][ + 1] * srcPoints[i][1] + coeff[1][2] + return dstPoints + + def analyze(self, image, scale, center_w, center_h): + input_tensor, matrix = self.preprocess(image, scale, center_w, + center_h) + + if self.dl_framework == 'pytorch': + with torch.no_grad(): + output = 
self.alignment(input_tensor) + landmarks = output[-1][0] + else: + assert False + + landmarks = self.denorm_points(landmarks) + landmarks = landmarks.data.cpu().numpy()[0] + landmarks = self.postprocess(landmarks, np.linalg.inv(matrix)) + + return landmarks diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py new file mode 100644 index 000000000..a0efc10d8 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py @@ -0,0 +1,2 @@ +from .backbone import StackedHGNetV1 +from .utility import get_config, get_net diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py new file mode 100644 index 000000000..5bbfc2e2c --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py @@ -0,0 +1,5 @@ +from .stackedHGNetV1 import StackedHGNetV1 + +__all__ = [ + 'StackedHGNetV1', +] diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py new file mode 100644 index 000000000..ca37ea557 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn + + +class AddCoordsTh(nn.Module): + + def __init__(self, x_dim, y_dim, with_r=False, with_boundary=False): + super(AddCoordsTh, self).__init__() + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + self.with_boundary = with_boundary + + def forward(self, input_tensor, heatmap=None): + """ + input_tensor: (batch, c, x_dim, y_dim) + """ + batch_size_tensor = input_tensor.shape[0] + + xx_ones = torch.ones([1, self.y_dim], + dtype=torch.int32).to(input_tensor) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange( + self.x_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones.float(), xx_range.float()) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, self.x_dim], + dtype=torch.int32).to(input_tensor) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange( + self.y_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range.float(), yy_ones.float()) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 2, 1) + yy_channel = yy_channel.permute(0, 3, 2, 1) + + xx_channel = xx_channel / (self.x_dim - 1) + yy_channel = yy_channel / (self.y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1) + yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1) + + if self.with_boundary and heatmap is not None: + boundary_channel = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0) + + zero_tensor = torch.zeros_like(xx_channel).to(xx_channel) + xx_boundary_channel = torch.where(boundary_channel > 0.05, + xx_channel, zero_tensor) + yy_boundary_channel = torch.where(boundary_channel > 0.05, + yy_channel, zero_tensor) + ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1) + + if self.with_r: + rr = torch.sqrt( + torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2)) + rr = rr / torch.max(rr) + ret = torch.cat([ret, rr], dim=1) + + if self.with_boundary and heatmap is not None: + ret = torch.cat([ret, xx_boundary_channel, yy_boundary_channel], + dim=1) + return ret 
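AddCoordsTh.forward above implements the CoordConv idea: two extra channels holding each pixel's x/y coordinates, rescaled to [-1, +1], are concatenated with the input (plus optional radius and boundary-masked variants) before the convolution. A stripped-down sketch of just the core step, assuming the radius and boundary options are disabled:

import torch

def add_coord_channels(x: torch.Tensor) -> torch.Tensor:
    # x: (batch, channels, h, w) -> (batch, channels + 2, h, w)
    b, _, h, w = x.shape
    ys = torch.linspace(-1.0, 1.0, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
    xs = torch.linspace(-1.0, 1.0, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
    return torch.cat([x, xs, ys], dim=1)

# The convolution that follows must accept two extra input channels, which is
# exactly the in_channels adjustment CoordConvTh makes in its __init__.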
+ + +class CoordConvTh(nn.Module): + """CoordConv layer as in the paper.""" + + def __init__(self, + x_dim, + y_dim, + with_r, + with_boundary, + in_channels, + out_channels, + first_one=False, + relu=False, + bn=False, + *args, + **kwargs): + super(CoordConvTh, self).__init__() + self.addcoords = AddCoordsTh( + x_dim=x_dim, + y_dim=y_dim, + with_r=with_r, + with_boundary=with_boundary) + in_channels += 2 + if with_r: + in_channels += 1 + if with_boundary and not first_one: + in_channels += 2 + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + *args, + **kwargs) + self.relu = nn.ReLU() if relu else None + self.bn = nn.BatchNorm2d(out_channels) if bn else None + + self.with_boundary = with_boundary + self.first_one = first_one + + def forward(self, input_tensor, heatmap=None): + assert (self.with_boundary and not self.first_one) == ( + heatmap is not None) + ret = self.addcoords(input_tensor, heatmap) + ret = self.conv(ret) + if self.bn is not None: + ret = self.bn(ret) + if self.relu is not None: + ret = self.relu(ret) + + return ret + + +''' +An alternative implementation for PyTorch with auto-infering the x-y dimensions. +''' + + +class AddCoords(nn.Module): + + def __init__(self, with_r=False): + super().__init__() + self.with_r = with_r + + def forward(self, input_tensor): + """ + Args: + input_tensor: shape(batch, channel, x_dim, y_dim) + """ + batch_size, _, x_dim, y_dim = input_tensor.size() + + xx_channel = torch.arange(x_dim).repeat(1, y_dim, 1).to(input_tensor) + yy_channel = torch.arange(y_dim).repeat(1, x_dim, 1).transpose( + 1, 2).to(input_tensor) + + xx_channel = xx_channel / (x_dim - 1) + yy_channel = yy_channel / (y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) + yy_channel = yy_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) + + ret = torch.cat( + [ # noqa + input_tensor, # noqa + xx_channel.type_as(input_tensor), # noqa + yy_channel.type_as(input_tensor) # noqa + ], # noqa + dim=1) # noqa + + if self.with_r: + rr = torch.sqrt( + torch.pow(xx_channel - 0.5, 2) + + torch.pow(yy_channel - 0.5, 2)) + ret = torch.cat([ret, rr], dim=1) + + return ret + + +class CoordConv(nn.Module): + + def __init__(self, in_channels, out_channels, with_r=False, **kwargs): + super().__init__() + self.addcoords = AddCoords(with_r=with_r) + in_channels += 2 + if with_r: + in_channels += 1 + self.conv = nn.Conv2d(in_channels, out_channels, **kwargs) + + def forward(self, x): + ret = self.addcoords(x) + ret = self.conv(ret) + return ret diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py new file mode 100644 index 000000000..f330cc034 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py @@ -0,0 +1,374 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..dataset import get_decoder +from .core.coord_conv import CoordConvTh + + +class Activation(nn.Module): + + def __init__(self, kind: str = 'relu', channel=None): + super().__init__() + self.kind = kind + + if '+' in kind: + norm_str, act_str = kind.split('+') + else: + norm_str, act_str = 'none', kind + + self.norm_fn = { + 'in': + F.instance_norm, + 'bn': + nn.BatchNorm2d(channel), + 'bn_noaffine': + nn.BatchNorm2d(channel, affine=False, track_running_stats=True), + 'none': + None + }[norm_str] + + self.act_fn 
= { + 'relu': F.relu, + 'softplus': nn.Softplus(), + 'exp': torch.exp, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'none': None + }[act_str] + + self.channel = channel + + def forward(self, x): + if self.norm_fn is not None: + x = self.norm_fn(x) + if self.act_fn is not None: + x = self.act_fn(x) + return x + + def extra_repr(self): + return f'kind={self.kind}, channel={self.channel}' + + +class ConvBlock(nn.Module): + + def __init__(self, + inp_dim, + out_dim, + kernel_size=3, + stride=1, + bn=False, + relu=True, + groups=1): + super(ConvBlock, self).__init__() + self.inp_dim = inp_dim + self.conv = nn.Conv2d( + inp_dim, + out_dim, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=True) + self.relu = None + self.bn = None + if relu: + self.relu = nn.ReLU() + if bn: + self.bn = nn.BatchNorm2d(out_dim) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu is not None: + x = self.relu(x) + return x + + +class ResBlock(nn.Module): + + def __init__(self, inp_dim, out_dim, mid_dim=None): + super(ResBlock, self).__init__() + if mid_dim is None: + mid_dim = out_dim // 2 + self.relu = nn.ReLU() + self.bn1 = nn.BatchNorm2d(inp_dim) + self.conv1 = ConvBlock(inp_dim, mid_dim, 1, relu=False) + self.bn2 = nn.BatchNorm2d(mid_dim) + self.conv2 = ConvBlock(mid_dim, mid_dim, 3, relu=False) + self.bn3 = nn.BatchNorm2d(mid_dim) + self.conv3 = ConvBlock(mid_dim, out_dim, 1, relu=False) + self.skip_layer = ConvBlock(inp_dim, out_dim, 1, relu=False) + if inp_dim == out_dim: + self.need_skip = False + else: + self.need_skip = True + + def forward(self, x): + if self.need_skip: + residual = self.skip_layer(x) + else: + residual = x + out = self.bn1(x) + out = self.relu(out) + out = self.conv1(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn3(out) + out = self.relu(out) + out = self.conv3(out) + out += residual + return out + + +class Hourglass(nn.Module): + + def __init__(self, + n, + f, + increase=0, + up_mode='nearest', + add_coord=False, + first_one=False, + x_dim=64, + y_dim=64): + super(Hourglass, self).__init__() + nf = f + increase + + Block = ResBlock + + if add_coord: + self.coordconv = CoordConvTh( + x_dim=x_dim, + y_dim=y_dim, + with_r=True, + with_boundary=True, + relu=False, + bn=False, + in_channels=f, + out_channels=f, + first_one=first_one, + kernel_size=1, + stride=1, + padding=0) + else: + self.coordconv = None + self.up1 = Block(f, f) + + # Lower branch + self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2) + + self.low1 = Block(f, nf) + self.n = n + # Recursive hourglass + if self.n > 1: + self.low2 = Hourglass( + n=n - 1, + f=nf, + increase=increase, + up_mode=up_mode, + add_coord=False) + else: + self.low2 = Block(nf, nf) + self.low3 = Block(nf, f) + self.up2 = nn.Upsample(scale_factor=2, mode=up_mode) + + def forward(self, x, heatmap=None): + if self.coordconv is not None: + x = self.coordconv(x, heatmap) + up1 = self.up1(x) + pool1 = self.pool1(x) + low1 = self.low1(pool1) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +class E2HTransform(nn.Module): + + def __init__(self, edge_info, num_points, num_edges): + super().__init__() + + e2h_matrix = np.zeros([num_points, num_edges]) + for edge_id, isclosed_indices in enumerate(edge_info): + is_closed, indices = isclosed_indices + for point_id in indices: + e2h_matrix[point_id, edge_id] = 1 + e2h_matrix = torch.from_numpy(e2h_matrix).float() + + # pn x en x 1 x 1. 
+ self.register_buffer( + 'weight', + e2h_matrix.view(e2h_matrix.size(0), e2h_matrix.size(1), 1, 1)) + + # some keypoints are not coverred by any edges, + # in these cases, we must add a constant bias to their heatmap weights. + bias = ((e2h_matrix @ torch.ones(e2h_matrix.size(1)).to(e2h_matrix)) + < # noqa + 0.5).to(e2h_matrix) # noqa + # pn x 1. + self.register_buffer('bias', bias) + + def forward(self, edgemaps): + # input: batch_size x en x hw x hh. + # output: batch_size x pn x hw x hh. + return F.conv2d(edgemaps, weight=self.weight, bias=self.bias) + + +class StackedHGNetV1(nn.Module): + + def __init__(self, + config, + classes_num, + edge_info, + nstack=4, + nlevels=4, + in_channel=256, + increase=0, + add_coord=True, + decoder_type='default'): + super(StackedHGNetV1, self).__init__() + + self.cfg = config + self.coder_type = decoder_type + self.decoder = get_decoder(decoder_type=decoder_type) + self.nstack = nstack + self.add_coord = add_coord + + self.num_heats = classes_num[0] + + if self.add_coord: + convBlock = CoordConvTh( + x_dim=self.cfg.width, + y_dim=self.cfg.height, + with_r=True, + with_boundary=False, + relu=True, + bn=True, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + else: + convBlock = ConvBlock(3, 64, 7, 2, bn=True, relu=True) + + pool = nn.MaxPool2d(kernel_size=2, stride=2) + + Block = ResBlock + + self.pre = nn.Sequential(convBlock, Block(64, 128), pool, + Block(128, 128), Block(128, in_channel)) + + self.hgs = nn.ModuleList([ + Hourglass( + n=nlevels, + f=in_channel, + increase=increase, + add_coord=self.add_coord, + first_one=(_ == 0), + x_dim=int(self.cfg.width / self.nstack), + y_dim=int(self.cfg.height / self.nstack)) + for _ in range(nstack) + ]) + + self.features = nn.ModuleList([ + nn.Sequential( + Block(in_channel, in_channel), + ConvBlock(in_channel, in_channel, 1, bn=True, relu=True)) + for _ in range(nstack) + ]) + + self.out_heatmaps = nn.ModuleList([ + ConvBlock(in_channel, self.num_heats, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + + if self.cfg.use_AAM: + self.num_edges = classes_num[1] + self.num_points = classes_num[2] + + self.e2h_transform = E2HTransform(edge_info, self.num_points, + self.num_edges) + self.out_edgemaps = nn.ModuleList([ + ConvBlock(in_channel, self.num_edges, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + self.out_pointmaps = nn.ModuleList([ + ConvBlock( + in_channel, self.num_points, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + self.merge_edgemaps = nn.ModuleList([ + ConvBlock(self.num_edges, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.merge_pointmaps = nn.ModuleList([ + ConvBlock( + self.num_points, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.edgemap_act = Activation('sigmoid', self.num_edges) + self.pointmap_act = Activation('sigmoid', self.num_points) + + self.merge_features = nn.ModuleList([ + ConvBlock(in_channel, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.merge_heatmaps = nn.ModuleList([ + ConvBlock(self.num_heats, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + + self.nstack = nstack + + self.heatmap_act = Activation('in+relu', self.num_heats) + + self.inference = False + + def set_inference(self, inference): + self.inference = inference + + def forward(self, x): + x = self.pre(x) + + y, fusionmaps = [], [] + heatmaps = None + for i in range(self.nstack): + hg = self.hgs[i](x, heatmap=heatmaps) + feature = 
self.features[i](hg) + + heatmaps0 = self.out_heatmaps[i](feature) + heatmaps = self.heatmap_act(heatmaps0) + + if self.cfg.use_AAM: + pointmaps0 = self.out_pointmaps[i](feature) + pointmaps = self.pointmap_act(pointmaps0) + edgemaps0 = self.out_edgemaps[i](feature) + edgemaps = self.edgemap_act(edgemaps0) + mask = self.e2h_transform(edgemaps) * pointmaps + fusion_heatmaps = mask * heatmaps + else: + fusion_heatmaps = heatmaps + + landmarks = self.decoder.get_coords_from_heatmap(fusion_heatmaps) + + if i < self.nstack - 1: + x = x + self.merge_features[i](feature) + \ + self.merge_heatmaps[i](heatmaps) + if self.cfg.use_AAM: + x += self.merge_pointmaps[i](pointmaps) + x += self.merge_edgemaps[i](edgemaps) + + y.append(landmarks) + if self.cfg.use_AAM: + y.append(pointmaps) + y.append(edgemaps) + + fusionmaps.append(fusion_heatmaps) + + return y, fusionmaps, landmarks diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py new file mode 100644 index 000000000..bede64a74 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py @@ -0,0 +1,5 @@ +from .alignmentDataset import AlignmentDataset +from .decoder import get_decoder +from .encoder import get_encoder + +__all__ = ['Augmentation', 'AlignmentDataset', 'get_encoder', 'get_decoder'] diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py new file mode 100644 index 000000000..d0105489a --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py @@ -0,0 +1,360 @@ +import copy +import hashlib +import math +import os +import sys + +import cv2 +import imageio +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F +from PIL import Image, ImageEnhance, ImageFile +from scipy import interpolate +from torch.utils.data import Dataset + +from .encoder import get_encoder + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +class AlignmentDataset(Dataset): + + def __init__( + self, + tsv_flie, + image_dir='', + transform=None, + width=256, + height=256, + channels=3, + means=(127.5, 127.5, 127.5), + scale=1 / 127.5, + classes_num=None, + crop_op=True, + aug_prob=0.0, + edge_info=None, + flip_mapping=None, + is_train=True, + encoder_type='default', + ): + super(AlignmentDataset, self).__init__() + self.use_AAM = True + self.encoder_type = encoder_type + self.encoder = get_encoder(height, width, encoder_type=encoder_type) + self.items = pd.read_csv(tsv_flie, sep='\t') + self.image_dir = image_dir + self.landmark_num = classes_num[0] + self.transform = transform + + self.image_width = width + self.image_height = height + self.channels = channels + assert self.image_width == self.image_height + + self.means = means + self.scale = scale + + self.aug_prob = aug_prob + self.edge_info = edge_info + self.is_train = is_train + std_lmk_5pts = np.array([ + 196.0, 226.0, 316.0, 226.0, 256.0, 286.0, 220.0, 360.4, 292.0, + 360.4 + ], np.float32) / 256.0 - 1.0 + std_lmk_5pts = np.reshape(std_lmk_5pts, (5, 2)) # [-1 1] + target_face_scale = 1.0 if crop_op else 1.25 + + self.augmentation = Augmentation( + is_train=self.is_train, + aug_prob=self.aug_prob, + image_size=self.image_width, + crop_op=crop_op, + std_lmk_5pts=std_lmk_5pts, + target_face_scale=target_face_scale, + flip_rate=0.5, + flip_mapping=flip_mapping, + random_shift_sigma=0.05, + random_rot_sigma=math.pi / 180 * 
18, + random_scale_sigma=0.1, + random_gray_rate=0.2, + random_occ_rate=0.4, + random_blur_rate=0.3, + random_gamma_rate=0.2, + random_nose_fusion_rate=0.2) + + def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'): + # Check that any part of the gaussian is in-bounds + tmp_size = sigma * 3 + ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)] + br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)] + if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 + or br[0] - 1 < 0 or br[1] - 1 < 0): + # If not, just return the image as is + return img + + # Generate gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + if label_type == 'Gaussian': + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + else: + g = sigma / (((x - x0)**2 + (y - y0)**2 + sigma**2)**1.5) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], img.shape[1]) + img_y = max(0, ul[1]), min(br[1], img.shape[0]) + + img[img_y[0]:img_y[1], + img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return img + + def _polylines(self, + img, + lmks, + is_closed, + color=255, + thickness=1, + draw_mode=cv2.LINE_AA, + interpolate_mode=cv2.INTER_AREA, + scale=4): + h, w = img.shape + img_scale = cv2.resize( + img, (w * scale, h * scale), interpolation=interpolate_mode) + lmks_scale = (lmks * scale + 0.5).astype(np.int32) + cv2.polylines(img_scale, [lmks_scale], is_closed, color, + thickness * scale, draw_mode) + img = cv2.resize(img_scale, (w, h), interpolation=interpolate_mode) + return img + + def _generate_edgemap(self, points, scale=0.25, thickness=1): + h, w = self.image_height, self.image_width + edgemaps = [] + for is_closed, indices in self.edge_info: + edgemap = np.zeros([h, w], dtype=np.float32) + # align_corners: False. 
+ part = copy.deepcopy(points[np.array(indices)]) + + part = self._fit_curve(part, is_closed) + part[:, 0] = np.clip(part[:, 0], 0, w - 1) + part[:, 1] = np.clip(part[:, 1], 0, h - 1) + edgemap = self._polylines(edgemap, part, is_closed, 255, thickness) + + edgemaps.append(edgemap) + edgemaps = np.stack(edgemaps, axis=0) / 255.0 + edgemaps = torch.from_numpy(edgemaps).float().unsqueeze(0) + edgemaps = F.interpolate( + edgemaps, + size=(int(w * scale), int(h * scale)), + mode='bilinear', + align_corners=False).squeeze() + return edgemaps + + def _fit_curve(self, lmks, is_closed=False, density=5): + try: + x = lmks[:, 0].copy() + y = lmks[:, 1].copy() + if is_closed: + x = np.append(x, x[0]) + y = np.append(y, y[0]) + tck, u = interpolate.splprep([x, y], s=0, per=is_closed, k=3) + # bins = (x.shape[0] - 1) * density + 1 + # lmk_x, lmk_y = interpolate.splev(np.linspace(0, 1, bins), f) + intervals = np.array([]) + for i in range(len(u) - 1): + intervals = np.concatenate( + (intervals, + np.linspace(u[i], u[i + 1], density, endpoint=False))) + if not is_closed: + intervals = np.concatenate((intervals, [u[-1]])) + lmk_x, lmk_y = interpolate.splev(intervals, tck, der=0) + # der_x, der_y = interpolate.splev(intervals, tck, der=1) + curve_lmks = np.stack([lmk_x, lmk_y], axis=-1) + # curve_ders = np.stack([der_x, der_y], axis=-1) + # origin_indices = np.arange(0, curve_lmks.shape[0], density) + + return curve_lmks + except Exception: + return lmks + + def _image_id(self, image_path): + if not os.path.exists(image_path): + image_path = os.path.join(self.image_dir, image_path) + return hashlib.md5(open(image_path, 'rb').read()).hexdigest() + + def _load_image(self, image_path): + if not os.path.exists(image_path): + image_path = os.path.join(self.image_dir, image_path) + + try: + # img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)#HWC, BGR, [0-255] + img = cv2.imread(image_path, cv2.IMREAD_COLOR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + try: + img = imageio.imread(image_path) # HWC, RGB, [0-255] + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + try: + gifImg = imageio.mimread(image_path) # BHWC, RGB, [0-255] + img = gifImg[0] # HWC, RGB, [0-255] + img = cv2.cvtColor(img, + cv2.COLOR_RGB2BGR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + img = None + return img + + def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, + to_center): + cosv = math.cos(angle) + sinv = math.sin(angle) + + fx, fy = from_center + tx, ty = to_center + + acos = scale * cosv + asin = scale * sinv + + a0 = acos + a1 = -asin + a2 = tx - acos * fx + asin * fy + shift_xy[0] + + b0 = asin + b1 = acos + b2 = ty - asin * fx - acos * fy + shift_xy[1] + + rot_scale_m = np.array([[a0, a1, a2], [b0, b1, b2], [0.0, 0.0, 1.0]], + np.float32) + return rot_scale_m + + def _transformPoints2D(self, points, matrix): + """ + points (nx2), matrix (3x3) -> points (nx2) + """ + dtype = points.dtype + + # nx3 + points = np.concatenate([points, np.ones_like(points[:, [0]])], axis=1) + points = points @ np.transpose(matrix) # nx3 + points = points[:, :2] / points[:, [2, 2]] + return points.astype(dtype) + + def _transformPerspective(self, image, matrix, target_shape): + """ + image, matrix3x3 -> transformed_image + """ + return cv2.warpPerspective( + 
image, + matrix, + dsize=(target_shape[1], target_shape[0]), + flags=cv2.INTER_LINEAR, + borderValue=0) + + def _norm_points(self, points, h, w, align_corners=False): + if align_corners: + # [0, SIZE-1] -> [-1, +1] + des_points = points / torch.tensor([w - 1, h - 1]).to(points).view( + 1, 2) * 2 - 1 + else: + # [-0.5, SIZE-0.5] -> [-1, +1] + des_points = (points * 2 + 1) / torch.tensor( + [w, h]).to(points).view(1, 2) - 1 + des_points = torch.clamp(des_points, -1, 1) + return des_points + + def _denorm_points(self, points, h, w, align_corners=False): + if align_corners: + # [-1, +1] -> [0, SIZE-1] + des_points = (points + 1) / 2 * torch.tensor( + [w - 1, h - 1]).to(points).view(1, 1, 2) + else: + # [-1, +1] -> [-0.5, SIZE-0.5] + des_points = ( + (points + 1) * torch.tensor([w, h]).to(points).view(1, 1, 2) + - 1) / 2 + return des_points + + def __len__(self): + return len(self.items) + + def __getitem__(self, index): + sample = dict() + + image_path = self.items.iloc[index, 0] + landmarks_5pts = self.items.iloc[index, 1] + landmarks_5pts = np.array( + list(map(float, landmarks_5pts.split(','))), + dtype=np.float32).reshape(5, 2) + landmarks_target = self.items.iloc[index, 2] + landmarks_target = np.array( + list(map(float, landmarks_target.split(','))), + dtype=np.float32).reshape(self.landmark_num, 2) + scale = float(self.items.iloc[index, 3]) + center_w, center_h = float(self.items.iloc[index, 4]), float( + self.items.iloc[index, 5]) + if len(self.items.iloc[index]) > 6: + tags = np.array( + list( + map(lambda x: int(float(x)), + self.items.iloc[index, 6].split(',')))) + else: + tags = np.array([]) + + # image & keypoints alignment + image_path = image_path.replace('\\', '/') + # wflw testset + image_path = image_path.replace( + '//msr-facestore/Workspace/MSRA_EP_Allergan/users/yanghuan/training_data/wflw/rawImages/', + '') + # trainset + image_path = image_path.replace('./rawImages/', '') + image_path = os.path.join(self.image_dir, image_path) + + # image path + sample['image_path'] = image_path + + img = self._load_image(image_path) # HWC, BGR, [0, 255] + assert img is not None + + # augmentation + # landmarks_target = [-0.5, edge-0.5] + img, landmarks_target, matrix = \ + self.augmentation.process(img, landmarks_target, landmarks_5pts, scale, center_w, center_h) + + landmarks = self._norm_points( + torch.from_numpy(landmarks_target), self.image_height, + self.image_width) + + sample['label'] = [ + landmarks, + ] + + if self.use_AAM: + pointmap = self.encoder.generate_heatmap(landmarks_target) + edgemap = self._generate_edgemap(landmarks_target) + sample['label'] += [pointmap, edgemap] + + sample['matrix'] = matrix + + # image normalization + img = img.transpose(2, 0, 1).astype(np.float32) # CHW, BGR, [0, 255] + img[0, :, :] = (img[0, :, :] - self.means[0]) * self.scale + img[1, :, :] = (img[1, :, :] - self.means[1]) * self.scale + img[2, :, :] = (img[2, :, :] - self.means[2]) * self.scale + sample['data'] = torch.from_numpy(img) # CHW, BGR, [-1, 1] + + sample['tags'] = tags + + return sample diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py new file mode 100644 index 000000000..9acc9bcb5 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py @@ -0,0 +1,9 @@ +from .decoder_default import decoder_default + + +def get_decoder(decoder_type='default'): + if decoder_type == 'default': + decoder = decoder_default() + else: + raise 
NotImplementedError + return decoder diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py new file mode 100644 index 000000000..4e1c7c70c --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py @@ -0,0 +1,39 @@ +import torch + + +class decoder_default: + + def __init__(self, weight=1, use_weight_map=False): + self.weight = weight + self.use_weight_map = use_weight_map + + def _make_grid(self, h, w): + yy, xx = torch.meshgrid( + torch.arange(h).float() / (h - 1) * 2 - 1, + torch.arange(w).float() / (w - 1) * 2 - 1) + return yy, xx + + def get_coords_from_heatmap(self, heatmap): + """ + inputs: + - heatmap: batch x npoints x h x w + + outputs: + - coords: batch x npoints x 2 (x,y), [-1, +1] + - radius_sq: batch x npoints + """ + batch, npoints, h, w = heatmap.shape + if self.use_weight_map: + heatmap = heatmap * self.weight + + yy, xx = self._make_grid(h, w) + yy = yy.view(1, 1, h, w).to(heatmap) + xx = xx.view(1, 1, h, w).to(heatmap) + + heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6) + + yy_coord = (yy * heatmap).sum([2, 3]) / heatmap_sum # batch x npoints + xx_coord = (xx * heatmap).sum([2, 3]) / heatmap_sum # batch x npoints + coords = torch.stack([xx_coord, yy_coord], dim=-1) + + return coords diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py new file mode 100644 index 000000000..60af50821 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py @@ -0,0 +1,13 @@ +from .encoder_default import encoder_default + + +def get_encoder(image_height, + image_width, + scale=0.25, + sigma=1.5, + encoder_type='default'): + if encoder_type == 'default': + encoder = encoder_default(image_height, image_width, scale, sigma) + else: + raise NotImplementedError + return encoder diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py new file mode 100644 index 000000000..8bff79421 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py @@ -0,0 +1,68 @@ +import copy + +import numpy as np +import torch +import torch.nn.functional as F + + +class encoder_default: + + def __init__(self, image_height, image_width, scale=0.25, sigma=1.5): + self.image_height = image_height + self.image_width = image_width + self.scale = scale + self.sigma = sigma + + def generate_heatmap(self, points): + # points = (num_pts, 2) + h, w = self.image_height, self.image_width + pointmaps = [] + for i in range(len(points)): + pointmap = np.zeros([h, w], dtype=np.float32) + # align_corners: False. 
+ point = copy.deepcopy(points[i]) + point[0] = max(0, min(w - 1, point[0])) + point[1] = max(0, min(h - 1, point[1])) + pointmap = self._circle(pointmap, point, sigma=self.sigma) + + pointmaps.append(pointmap) + pointmaps = np.stack(pointmaps, axis=0) / 255.0 + pointmaps = torch.from_numpy(pointmaps).float().unsqueeze(0) + pointmaps = F.interpolate( + pointmaps, + size=(int(w * self.scale), int(h * self.scale)), + mode='bilinear', + align_corners=False).squeeze() + return pointmaps + + def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'): + # Check that any part of the gaussian is in-bounds + tmp_size = sigma * 3 + ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)] + br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)] + if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 + or br[0] - 1 < 0 or br[1] - 1 < 0): + # If not, just return the image as is + return img + + # Generate gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + if label_type == 'Gaussian': + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + else: + g = sigma / (((x - x0)**2 + (y - y0)**2 + sigma**2)**1.5) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], img.shape[1]) + img_y = max(0, ul[1]), min(br[1], img.shape[0]) + + img[img_y[0]:img_y[1], + img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return img diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/utility.py b/modelscope/models/cv/facial_68ldk_detection/lib/utility.py new file mode 100644 index 000000000..2e195761b --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/utility.py @@ -0,0 +1,54 @@ +import os.path as osp +import time + +import json +import numpy as np +import torch + +from ..conf import * +from .backbone import StackedHGNetV1 + + +def get_config(args): + config = None + config_name = args.config_name + if config_name == 'alignment': + config = Alignment(args) + else: + assert NotImplementedError + + return config + + +def get_net(config): + net = None + if config.net == 'stackedHGnet_v1': + net = StackedHGNetV1( + config=config, + classes_num=config.classes_num, + edge_info=config.edge_info, + nstack=config.nstack, + add_coord=config.add_coord, + decoder_type=config.decoder_type) + else: + assert False + return net + + +def set_environment(config): + if config.device_id >= 0: + assert torch.cuda.is_available( + ) and torch.cuda.device_count() > config.device_id + torch.cuda.empty_cache() + config.device = torch.device('cuda', config.device_id) + config.use_gpu = True + else: + config.device = torch.device('cpu') + config.use_gpu = False + + torch.set_default_dtype(torch.float32) + torch.set_default_tensor_type(torch.FloatTensor) + torch.set_flush_denormal(True) # ignore extremely small value + torch.backends.cudnn.benchmark = True + # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware. + torch.autograd.set_detect_anomaly(True) diff --git a/modelscope/models/cv/facial_68ldk_detection/star_model.py b/modelscope/models/cv/facial_68ldk_detection/star_model.py new file mode 100644 index 000000000..c0d37ef10 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/star_model.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.facial_68ldk_detection import infer +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@MODELS.register_module( + Tasks.facial_68ldk_detection, module_name=Models.star_68ldk_detection) +class FaceLandmarkDetection(TorchModel): + + def __init__(self, model_dir, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + def forward(self, Inputs): + return Inputs + + def postprocess(self, Inputs): + return Inputs + + def inference(self, data): + return data diff --git a/modelscope/models/cv/head_reconstruction/models/headrecon_model.py b/modelscope/models/cv/head_reconstruction/models/headrecon_model.py index e515421c1..a3d5cb6f9 100644 --- a/modelscope/models/cv/head_reconstruction/models/headrecon_model.py +++ b/modelscope/models/cv/head_reconstruction/models/headrecon_model.py @@ -109,7 +109,7 @@ def __init__(self, model_dir, *args, **kwargs): ] self.compute_feat_loss = perceptual_loss - self.comupte_color_loss = photo_loss + self.compute_color_loss = photo_loss self.compute_lm_loss = landmark_loss self.compute_reg_loss = reg_loss self.compute_reflc_loss = reflectance_loss @@ -519,7 +519,7 @@ def get_edge_points_horizontal(self): def compute_losses_fitting(self): face_mask = self.pred_mask face_mask = face_mask.detach() - self.loss_color = self.opt.w_color * self.comupte_color_loss( + self.loss_color = self.opt.w_color * self.compute_color_loss( self.pred_face, self.input_img, face_mask) # 1.0 loss_reg, loss_gamma = self.compute_reg_loss( @@ -552,7 +552,7 @@ def compute_losses_fitting(self): head_mask = self.pred_mask_head head_mask = head_mask.detach() - self.loss_color_head = self.opt.w_color * self.comupte_color_loss( + self.loss_color_head = self.opt.w_color * self.compute_color_loss( self.pred_head, self.input_img, head_mask) # 1.0 self.loss_smooth_offset_head = TVLoss()( self.shape_offset_uv_head.permute(0, 3, 1, 2)) * 100 # 10000 diff --git a/modelscope/models/cv/head_reconstruction/models/losses.py b/modelscope/models/cv/head_reconstruction/models/losses.py index 6d4af4e8d..e170112d9 100644 --- a/modelscope/models/cv/head_reconstruction/models/losses.py +++ b/modelscope/models/cv/head_reconstruction/models/losses.py @@ -49,7 +49,7 @@ def perceptual_loss(id_featureA, id_featureB): # image level loss def photo_loss(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA diff --git a/modelscope/models/cv/human3d_animation/generate_skeleton.py b/modelscope/models/cv/human3d_animation/generate_skeleton.py index 556cdbd37..6543c8485 100644 --- a/modelscope/models/cv/human3d_animation/generate_skeleton.py +++ b/modelscope/models/cv/human3d_animation/generate_skeleton.py @@ -9,7 +9,7 @@ from .utils import matrix_to_axis_angle, rotation_6d_to_matrix -def laod_smpl_params(pose_fname): +def load_smpl_params(pose_fname): with open(pose_fname, 'rb') as f: data = pickle.load(f) pose = 
torch.from_numpy(data['pose']) @@ -132,7 +132,7 @@ def gen_skeleton_bvh(model_dir, action_dir, case_dir, action, mode='move'): device = torch.device('cpu') assets_dir = os.path.join(model_dir, '3D-assets') pkl_path = os.path.join(assets_dir, 'smpl.pkl') - poses, shapes, trans, joints = laod_smpl_params(pkl_path) + poses, shapes, trans, joints = load_smpl_params(pkl_path) if action.endswith('.npy'): skeleton_path = os.path.join(assets_dir, 'skeleton_nohand.npy') else: diff --git a/modelscope/models/cv/human_image_generation/human_image_generation_infer.py b/modelscope/models/cv/human_image_generation/human_image_generation_infer.py index 0781d8930..420ce786a 100644 --- a/modelscope/models/cv/human_image_generation/human_image_generation_infer.py +++ b/modelscope/models/cv/human_image_generation/human_image_generation_infer.py @@ -148,7 +148,7 @@ def forward(self, x, y, z): return pred_result -def trans_keypoins(keypoints, param, img_size, offset=None): +def trans_keypoints(keypoints, param, img_size, offset=None): missing_keypoint_index = keypoints == -1 # crop the white line in the original dataset @@ -194,7 +194,7 @@ def get_label_tensor(path, img, param): [255, 0, 170], [255, 0, 85]] canvas = np.zeros((img.shape[1], img.shape[2], 3)).astype(np.uint8) keypoint = np.loadtxt(path) - keypoint, normalized_kp = trans_keypoins(keypoint, param, img.shape[1:]) + keypoint, normalized_kp = trans_keypoints(keypoint, param, img.shape[1:]) stickwidth = 4 for i in range(18): x, y = keypoint[i, 0:2] diff --git a/modelscope/models/cv/human_normal_estimation/__init__.py b/modelscope/models/cv/human_normal_estimation/__init__.py new file mode 100644 index 000000000..f176c6bfc --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_nnet import HumanNormalEstimation + +else: + _import_structure = { + 'human_nnet': ['HumanNormalEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/human_normal_estimation/human_nnet.py b/modelscope/models/cv/human_normal_estimation/human_nnet.py new file mode 100644 index 000000000..6621c8d3d --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/human_nnet.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
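# The human_normal_estimation __init__ above follows ModelScope's lazy-import
# convention: _import_structure maps the 'human_nnet' submodule to the names it
# exports, and LazyImportModule replaces the package module so that submodule
# is only imported when one of those names is first accessed. Illustrative
# usage (a hedged sketch; it assumes nothing beyond the mapping shown above):
from modelscope.models.cv.human_normal_estimation import HumanNormalEstimation
# The 'human_nnet' submodule (and its torch dependencies) is imported here, at
# first access, rather than when the parent package is loaded.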
+import os + +import numpy as np +import torch +import torchvision.transforms as T + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.human_normal_estimation.networks import config, nnet +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + Tasks.human_normal_estimation, module_name=Models.human_normal_estimation) +class HumanNormalEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + super().__init__(model_dir, **kwargs) + config_file = os.path.join(model_dir, 'config.txt') + args = config.get_args(txt_file=config_file) + args.encoder_path = os.path.join(model_dir, args.encoder_path) + + self.device = torch.device( + 'cuda:0') if torch.cuda.is_available() else torch.device('cpu') + self.nnet = nnet.NormalNet(args=args).to(self.device) + self.nnet_path = os.path.join(model_dir, 'ckpt/best_nnet.pt') + if os.path.exists(self.nnet_path): + ckpt = torch.load( + self.nnet_path, map_location=self.device)['model'] + load_dict = {} + for k, v in ckpt.items(): + if k.startswith('module.'): + k_ = k.replace('module.', '') + load_dict[k_] = v + else: + load_dict[k] = v + self.nnet.load_state_dict(load_dict) + self.nnet.eval() + + self.normalize = T.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + def forward(self, inputs): + img = inputs['img'].astype(np.float32) / 255.0 + msk = inputs['msk'].astype(np.float32) / 255.0 + bbox = inputs['bbox'] + + img_h, img_w = img.shape[0:2] + img = torch.from_numpy(img).permute(2, 0, + 1).unsqueeze(0).to(self.device) + img = self.normalize(img) + + fx = fy = (max(img_h, img_h) / 2.0) / np.tan(np.deg2rad(60.0 / 2.0)) + cx = (img_h / 2.0) - 0.5 + cy = (img_w / 2.0) - 0.5 + + intrins = torch.tensor( + [[fx, 0, cx + 0.5], [0, fy, cy + 0.5], [0, 0, 1]], + dtype=torch.float32, + device=self.device).unsqueeze(0) + + pred_norm = self.nnet(img, intrins=intrins)[-1] + pred_norm = pred_norm.detach().cpu().permute(0, 2, 3, 1).numpy() + pred_norm = pred_norm[0, ...] 
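# Note on the intrinsics assembled above: the forward pass assumes a pinhole
# camera with a 60-degree field of view, so the focal length is
# fx = fy = (S / 2) / tan(FOV / 2), with S the image edge used above, and the
# principal point (cx, cy) sits at the image centre (the -0.5 / +0.5 offsets
# reflect the usual pixel-centre convention). For example, S = 1024 and
# FOV = 60 deg give fx = 512 / tan(30 deg) ~= 886.8.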
+ pred_norm = pred_norm * msk[..., None] + pred_norm = pred_norm[bbox[1]:bbox[3], bbox[0]:bbox[2]] + results = pred_norm + return results + + def postprocess(self, inputs): + normal_result = inputs + results = {OutputKeys.NORMALS: normal_result} + return results + + def inference(self, data): + results = self.forward(data) + return results diff --git a/modelscope/models/cv/human_normal_estimation/networks/__init__.py b/modelscope/models/cv/human_normal_estimation/networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/human_normal_estimation/networks/config.py b/modelscope/models/cv/human_normal_estimation/networks/config.py new file mode 100644 index 000000000..1a4883091 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/config.py @@ -0,0 +1,40 @@ +import argparse + + +def convert_arg_line_to_args(arg_line): + for arg in arg_line.split(): + if not arg.strip(): + continue + yield str(arg) + + +def get_args(txt_file=None): + parser = argparse.ArgumentParser( + fromfile_prefix_chars='@', conflict_handler='resolve') + parser.convert_arg_line_to_args = convert_arg_line_to_args + + # checkpoint (only needed when testing the model) + parser.add_argument('--ckpt_path', type=str, default=None) + parser.add_argument('--encoder_path', type=str, default=None) + + # ↓↓↓↓ + # NOTE: project-specific args + parser.add_argument('--output_dim', type=int, default=3, help='{3, 4}') + parser.add_argument('--output_type', type=str, default='R', help='{R, G}') + parser.add_argument('--feature_dim', type=int, default=64) + parser.add_argument('--hidden_dim', type=int, default=64) + + parser.add_argument('--encoder_B', type=int, default=5) + + parser.add_argument('--decoder_NF', type=int, default=2048) + parser.add_argument('--decoder_BN', default=False, action='store_true') + parser.add_argument('--decoder_down', type=int, default=2) + parser.add_argument( + '--learned_upsampling', default=False, action='store_true') + + # read arguments from txt file + if txt_file: + config_filename = '@' + txt_file + + args = parser.parse_args([config_filename]) + return args diff --git a/modelscope/models/cv/human_normal_estimation/networks/nnet.py b/modelscope/models/cv/human_normal_estimation/networks/nnet.py new file mode 100644 index 000000000..e10e97c90 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/nnet.py @@ -0,0 +1,125 @@ +import os +import sys + +import torch +import torch.nn as nn + +from .submodules import (Encoder, UpSampleBN, UpSampleGN, get_pixel_coords, + get_prediction_head, normal_activation, + upsample_via_bilinear, upsample_via_mask) + +PROJECT_DIR = os.path.split(os.path.dirname(os.path.realpath(__file__)))[0] +sys.path.append(PROJECT_DIR) + + +class NormalNet(nn.Module): + + def __init__(self, args): + super(NormalNet, self).__init__() + B = args.encoder_B + NF = args.decoder_NF + BN = args.decoder_BN + learned_upsampling = args.learned_upsampling + + self.encoder = Encoder(B=B, pretrained=False, ckpt=args.encoder_path) + self.decoder = Decoder( + num_classes=args.output_dim, + B=B, + NF=NF, + BN=BN, + learned_upsampling=learned_upsampling) + + def forward(self, x, **kwargs): + return self.decoder(self.encoder(x), **kwargs) + + +class Decoder(nn.Module): + + def __init__(self, + num_classes=3, + B=5, + NF=2048, + BN=False, + learned_upsampling=True): + super(Decoder, self).__init__() + input_channels = [2048, 176, 64, 40, 24] + + UpSample = UpSampleBN if BN else UpSampleGN + features = NF + + self.conv2 = 
nn.Conv2d( + input_channels[0] + 2, + features, + kernel_size=1, + stride=1, + padding=0) + self.up1 = UpSample( + skip_input=features // 1 + input_channels[1] + 2, + output_features=features // 2, + align_corners=False) + self.up2 = UpSample( + skip_input=features // 2 + input_channels[2] + 2, + output_features=features // 4, + align_corners=False) + self.up3 = UpSample( + skip_input=features // 4 + input_channels[3] + 2, + output_features=features // 8, + align_corners=False) + self.up4 = UpSample( + skip_input=features // 8 + input_channels[4] + 2, + output_features=features // 16, + align_corners=False) + i_dim = features // 16 + + self.downsample_ratio = 2 + self.output_dim = num_classes + + self.pred_head = get_prediction_head(i_dim + 2, 128, num_classes) + if learned_upsampling: + self.mask_head = get_prediction_head( + i_dim + 2, 128, + 9 * self.downsample_ratio * self.downsample_ratio) + self.upsample_fn = upsample_via_mask + else: + self.mask_head = lambda a: None + self.upsample_fn = upsample_via_bilinear + + self.pixel_coords = get_pixel_coords(h=1024, w=1024).to(0) + + def ray_embedding(self, x, intrins, orig_H, orig_W): + B, _, H, W = x.shape + fu = intrins[:, 0, 0].unsqueeze(-1).unsqueeze(-1) * (W / orig_W) + cu = intrins[:, 0, 2].unsqueeze(-1).unsqueeze(-1) * (W / orig_W) + fv = intrins[:, 1, 1].unsqueeze(-1).unsqueeze(-1) * (H / orig_H) + cv = intrins[:, 1, 2].unsqueeze(-1).unsqueeze(-1) * (H / orig_H) + + uv = self.pixel_coords[:, :2, :H, :W].repeat(B, 1, 1, 1) + uv[:, 0, :, :] = (uv[:, 0, :, :] - cu) / fu + uv[:, 1, :, :] = (uv[:, 1, :, :] - cv) / fv + return torch.cat([x, uv], dim=1) + + def forward(self, features, intrins): + x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], \ + features[8], features[11] + _, _, orig_H, orig_W = features[0].shape + + x_d0 = self.conv2( + self.ray_embedding(x_block4, intrins, orig_H, orig_W)) + x_d1 = self.up1(x_d0, + self.ray_embedding(x_block3, intrins, orig_H, orig_W)) + x_d2 = self.up2(x_d1, + self.ray_embedding(x_block2, intrins, orig_H, orig_W)) + x_d3 = self.up3(x_d2, + self.ray_embedding(x_block1, intrins, orig_H, orig_W)) + x_feat = self.up4( + x_d3, self.ray_embedding(x_block0, intrins, orig_H, orig_W)) + + out = self.pred_head( + self.ray_embedding(x_feat, intrins, orig_H, orig_W)) + out = normal_activation(out, elu_kappa=True) + mask = self.mask_head( + self.ray_embedding(x_feat, intrins, orig_H, orig_W)) + up_out = self.upsample_fn( + out, up_mask=mask, downsample_ratio=self.downsample_ratio) + up_out = normal_activation(up_out, elu_kappa=False) + return [up_out] diff --git a/modelscope/models/cv/human_normal_estimation/networks/submodules.py b/modelscope/models/cv/human_normal_estimation/networks/submodules.py new file mode 100644 index 000000000..32fbd0116 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/submodules.py @@ -0,0 +1,214 @@ +import geffnet +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +INPUT_CHANNELS_DICT = { + 0: [1280, 112, 40, 24, 16], + 1: [1280, 112, 40, 24, 16], + 2: [1408, 120, 48, 24, 16], + 3: [1536, 136, 48, 32, 24], + 4: [1792, 160, 56, 32, 24], + 5: [2048, 176, 64, 40, 24], + 6: [2304, 200, 72, 40, 32], + 7: [2560, 224, 80, 48, 32] +} + + +class Encoder(nn.Module): + + def __init__(self, B=5, pretrained=True, ckpt=None): + super(Encoder, self).__init__() + if ckpt: + basemodel = geffnet.create_model( + 'tf_efficientnet_b%s_ap' % B, + pretrained=pretrained, + checkpoint_path=ckpt) + else: + 
basemodel = geffnet.create_model( + 'tf_efficientnet_b%s_ap' % B, pretrained=pretrained) + + basemodel.global_pool = nn.Identity() + basemodel.classifier = nn.Identity() + self.original_model = basemodel + + def forward(self, x): + features = [x] + for k, v in self.original_model._modules.items(): + if k == 'blocks': + for ki, vi in v._modules.items(): + features.append(vi(features[-1])) + else: + features.append(v(features[-1])) + return features + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim, input_dim, ks=3): + super().__init__() + p = (ks - 1) // 2 + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + return h + + +class UpSampleBN(nn.Module): + + def __init__(self, skip_input, output_features, align_corners=True): + super(UpSampleBN, self).__init__() + self._net = nn.Sequential( + nn.Conv2d( + skip_input, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU(), + nn.Conv2d( + output_features, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU()) + self.align_corners = align_corners + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=[concat_with.size(2), + concat_with.size(3)], + mode='bilinear', + align_corners=self.align_corners) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +class Conv2d_WS(nn.Conv2d): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super(Conv2d_WS, + self).__init__(in_channels, out_channels, kernel_size, stride, + padding, dilation, groups, bias) + + def forward(self, x): + weight = self.weight + weight_mean = weight.mean( + dim=1, keepdim=True).mean( + dim=2, keepdim=True).mean( + dim=3, keepdim=True) + weight = weight - weight_mean + std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, + 1) + 1e-5 + weight = weight / std.expand_as(weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + +class UpSampleGN(nn.Module): + + def __init__(self, skip_input, output_features, align_corners=True): + super(UpSampleGN, self).__init__() + self._net = nn.Sequential( + Conv2d_WS( + skip_input, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU(), + Conv2d_WS( + output_features, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU()) + self.align_corners = align_corners + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=[concat_with.size(2), + concat_with.size(3)], + mode='bilinear', + align_corners=self.align_corners) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +def upsample_via_bilinear(out, up_mask=None, downsample_ratio=None): + return F.interpolate( + out, + scale_factor=downsample_ratio, + mode='bilinear', + align_corners=False) + + +def upsample_via_mask(out, up_mask, downsample_ratio, padding='zero'): + """ + convex upsampling + """ + # out: low-resolution output (B, o_dim, H, W) + # up_mask: 
(B, 9*k*k, H, W) + k = downsample_ratio + + B, C, H, W = out.shape + up_mask = up_mask.view(B, 1, 9, k, k, H, W) + up_mask = torch.softmax(up_mask, dim=2) # (B, 1, 9, k, k, H, W) + + if padding == 'zero': + up_out = F.unfold(out, [3, 3], padding=1) + elif padding == 'replicate': + out = F.pad(out, pad=(1, 1, 1, 1), mode='replicate') + up_out = F.unfold(out, [3, 3], padding=0) + else: + raise Exception('invalid padding for convex upsampling') + + up_out = up_out.view(B, C, 9, 1, 1, H, W) + + up_out = torch.sum(up_mask * up_out, dim=2) + up_out = up_out.permute(0, 1, 4, 2, 5, 3) + return up_out.reshape(B, C, k * H, k * W) + + +def get_prediction_head(input_dim, hidden_dim, output_dim): + return nn.Sequential( + nn.Conv2d(input_dim, hidden_dim, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, hidden_dim, 1), nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, output_dim, 1)) + + +# submodules copy from DSINE +def get_pixel_coords(h, w): + pixel_coords = np.ones((3, h, w)).astype(np.float32) + x_range = np.concatenate([np.arange(w).reshape(1, w)] * h, axis=0) + y_range = np.concatenate([np.arange(h).reshape(h, 1)] * w, axis=1) + pixel_coords[0, :, :] = x_range + 0.5 + pixel_coords[1, :, :] = y_range + 0.5 + return torch.from_numpy(pixel_coords).unsqueeze(0) + + +def normal_activation(out, elu_kappa=True): + normal, kappa = out[:, :3, :, :], out[:, 3:, :, :] + normal = F.normalize(normal, p=2, dim=1) + if elu_kappa: + kappa = F.elu(kappa) + 1.0 + return torch.cat([normal, kappa], dim=1) diff --git a/modelscope/models/cv/human_reconstruction/models/detectors.py b/modelscope/models/cv/human_reconstruction/models/detectors.py index 4f63dd8c7..0fc41ab9e 100644 --- a/modelscope/models/cv/human_reconstruction/models/detectors.py +++ b/modelscope/models/cv/human_reconstruction/models/detectors.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on Pytorch, originally BSD License and publicly avaialbe at +# The implementation here is modified based on Pytorch, originally BSD License and publicly available at # https://github.com/pytorch/pytorch import numpy as np import torch diff --git a/modelscope/models/cv/human_reconstruction/models/geometry.py b/modelscope/models/cv/human_reconstruction/models/geometry.py index fa4a00a6b..43ef6da6c 100644 --- a/modelscope/models/cv/human_reconstruction/models/geometry.py +++ b/modelscope/models/cv/human_reconstruction/models/geometry.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on PIFU, originally MIT License and publicly avaialbe at +# The implementation here is modified based on PIFU, originally MIT License and publicly available at # https://github.com/shunsukesaito/PIFu/blob/master/lib/geometry.py import torch @@ -44,7 +44,7 @@ def perspective(points, calib, transform=None): args: points: [B, 3, N] 3d points in world coordinates calib: [B, 3, 4] projection matrix - transform: [B, 2, 3] screen space trasnformation + transform: [B, 2, 3] screen space transformation return: [B, 3, N] 3d coordinates in screen space """ diff --git a/modelscope/models/cv/human_reconstruction/models/networks.py b/modelscope/models/cv/human_reconstruction/models/networks.py index 266237b6b..1ef8c801e 100644 --- a/modelscope/models/cv/human_reconstruction/models/networks.py +++ b/modelscope/models/cv/human_reconstruction/models/networks.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on Pix2PixHD, originally BSD License and publicly avaialbe at +# The implementation here is modified based on Pix2PixHD, originally BSD License and 
publicly available at # https://github.com/NVIDIA/pix2pixHD import functools diff --git a/modelscope/models/cv/image_body_reshaping/person_info.py b/modelscope/models/cv/image_body_reshaping/person_info.py index 509a2ce30..d205ae9ec 100644 --- a/modelscope/models/cv/image_body_reshaping/person_info.py +++ b/modelscope/models/cv/image_body_reshaping/person_info.py @@ -15,7 +15,7 @@ class PersonInfo(object): def __init__(self, joints): self.joints = joints self.flow = None - self.pad_boder = False + self.pad_border = False self.height_expand = 0 self.width_expand = 0 self.coeff = 0.2 @@ -24,11 +24,11 @@ def __init__(self, joints): self.divider = 20 self.flow_scales = ['upper_2'] - def update_attribute(self, pad_boder, height_expand, width_expand): - self.pad_boder = pad_boder + def update_attribute(self, pad_border, height_expand, width_expand): + self.pad_border = pad_border self.height_expand = height_expand self.width_expand = width_expand - if pad_boder: + if pad_border: self.joints[:, 0] += width_expand self.joints[:, 1] += height_expand @@ -41,7 +41,7 @@ def pred_flow(self, img, flow_net, device): if len(img.shape) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - if self.pad_boder: + if self.pad_border: height_expand = self.height_expand width_expand = self.width_expand pad_img = cv2.copyMakeBorder( diff --git a/modelscope/models/cv/image_body_reshaping/slim_utils.py b/modelscope/models/cv/image_body_reshaping/slim_utils.py index 23d5a741f..4ee0a6120 100644 --- a/modelscope/models/cv/image_body_reshaping/slim_utils.py +++ b/modelscope/models/cv/image_body_reshaping/slim_utils.py @@ -439,10 +439,10 @@ def get_heatmap_cv(img, magn, max_flow_mag): return cv_out -def save_heatmap_cv(img, flow, supression=2): +def save_heatmap_cv(img, flow, suppression=2): flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) - flow_magn -= supression + flow_magn -= suppression flow_magn[flow_magn <= 0] = 0 cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) return cv_out diff --git a/modelscope/models/cv/image_classification/backbones/beit_v2.py b/modelscope/models/cv/image_classification/backbones/beit_v2.py index eda117279..a567eada8 100644 --- a/modelscope/models/cv/image_classification/backbones/beit_v2.py +++ b/modelscope/models/cv/image_classification/backbones/beit_v2.py @@ -41,7 +41,7 @@ def forward(self, x): x = self.fc1(x) x = self.act(x) # x = self.drop(x) - # commit this for the orignal BERT implement + # commit this for the original BERT implement x = self.fc2(x) x = self.drop(x) return x diff --git a/modelscope/models/cv/image_color_enhance/adaint/adaint.py b/modelscope/models/cv/image_color_enhance/adaint/adaint.py index 8839f03a9..6977cb5a9 100644 --- a/modelscope/models/cv/image_color_enhance/adaint/adaint.py +++ b/modelscope/models/cv/image_color_enhance/adaint/adaint.py @@ -92,7 +92,7 @@ class Res18Backbone(nn.Module): r"""The ResNet-18 backbone. Args: - pretrained (bool, optional): Whether to use the torchvison pretrained weights. + pretrained (bool, optional): Whether to use the torchvision pretrained weights. Default: True. input_resolution (int, optional): Resolution for pre-downsampling. Default: 224. extra_pooling (bool, optional): [ignore]. @@ -312,7 +312,7 @@ def init_weights(self): and bias, respectively. 
""" - def special_initilization(m): + def special_initialization(m): classname = m.__class__.__name__ if 'Conv' in classname: nn.init.xavier_normal_(m.weight.data) @@ -321,7 +321,7 @@ def special_initilization(m): nn.init.constant_(m.bias.data, 0.0) if self.backbone_name not in ['res18']: - self.apply(special_initilization) + self.apply(special_initialization) self.lut_generator.init_weights() if self.en_adaint: self.adaint.init_weights() diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py index bc118ff21..de279d1c0 100644 --- a/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py +++ b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py @@ -56,7 +56,7 @@ def is_torch_version_available(): `pip install torch==1.11` """ -REQUIREMENTS_MAAPING_VERSION = OrderedDict([ +REQUIREMENTS_MAPPING_VERSION = OrderedDict([ ('detectron2-0.3', (is_detectron2_version_available, DETECTRON2_IMPORT_ERROR)), ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)), @@ -68,8 +68,8 @@ def is_torch_version_available(): def requires_version(): checks = [] for req in REQUIREMENTS: - if req in REQUIREMENTS_MAAPING_VERSION: - check = REQUIREMENTS_MAAPING_VERSION[req] + if req in REQUIREMENTS_MAPPING_VERSION: + check = REQUIREMENTS_MAPPING_VERSION[req] else: raise NotImplementedError('{} do not supported check'.format(req)) checks.append(check) diff --git a/modelscope/models/cv/image_depth_estimation_marigold/__init__.py b/modelscope/models/cv/image_depth_estimation_marigold/__init__.py new file mode 100644 index 000000000..15e4c01eb --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .marigold import MarigoldDepthOutput + from .marigold_utils import (chw2hwc, colorize_depth_maps, ensemble_depths, + find_batch_size, inter_distances, + resize_max_res) +else: + _import_structure = { + 'marigold': ['MarigoldDepthOutput'], + 'marigold_utils': [ + 'find_batch_size', 'inter_distances', 'ensemble_depths', + 'colorize_depth_maps', 'chw2hwc', 'resize_max_res' + ] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_depth_estimation_marigold/marigold.py b/modelscope/models/cv/image_depth_estimation_marigold/marigold.py new file mode 100644 index 000000000..a597b68c0 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/marigold.py @@ -0,0 +1,42 @@ +# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -------------------------------------------------------------------------- +# If you find this code useful, we kindly ask you to cite our paper in your work. +# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation +# More information about the method can be found at https://marigoldmonodepth.github.io +# -------------------------------------------------------------------------- + +from typing import Dict, Union + +import numpy as np +from diffusers.utils import BaseOutput +from PIL import Image + + +class MarigoldDepthOutput(BaseOutput): + """ + Output class for Marigold monocular depth prediction pipeline. + + Args: + depth_np (`np.ndarray`): + Predicted depth map, with depth values in the range of [0, 1]. + depth_colored (`PIL.Image.Image`): + Colorized depth map, with the shape of [3, H, W] and values in [0, 1]. + uncertainty (`None` or `np.ndarray`): + Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling. + """ + + depth_np: np.ndarray + depth_colored: Image.Image + uncertainty: Union[None, np.ndarray] diff --git a/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py b/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py new file mode 100644 index 000000000..00bceafe0 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py @@ -0,0 +1,364 @@ +# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -------------------------------------------------------------------------- +# If you find this code useful, we kindly ask you to cite our paper in your work. +# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation +# More information about the method can be found at https://marigoldmonodepth.github.io + +import math + +import matplotlib +import numpy as np +import torch +from PIL import Image +from scipy.optimize import minimize + +# Search table for suggested max. 
inference batch size +bs_search_table = [ + # tested on A100-PCIE-80GB + { + 'res': 768, + 'total_vram': 79, + 'bs': 35, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 79, + 'bs': 20, + 'dtype': torch.float32 + }, + # tested on A100-PCIE-40GB + { + 'res': 768, + 'total_vram': 39, + 'bs': 15, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 39, + 'bs': 8, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 39, + 'bs': 30, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 39, + 'bs': 15, + 'dtype': torch.float16 + }, + # tested on RTX3090, RTX4090 + { + 'res': 512, + 'total_vram': 23, + 'bs': 20, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 23, + 'bs': 7, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 23, + 'bs': 3, + 'dtype': torch.float32 + }, + { + 'res': 512, + 'total_vram': 23, + 'bs': 40, + 'dtype': torch.float16 + }, + { + 'res': 768, + 'total_vram': 23, + 'bs': 18, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 23, + 'bs': 10, + 'dtype': torch.float16 + }, + # tested on GTX1080Ti + { + 'res': 512, + 'total_vram': 10, + 'bs': 5, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 10, + 'bs': 2, + 'dtype': torch.float32 + }, + { + 'res': 512, + 'total_vram': 10, + 'bs': 10, + 'dtype': torch.float16 + }, + { + 'res': 768, + 'total_vram': 10, + 'bs': 5, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 10, + 'bs': 3, + 'dtype': torch.float16 + }, +] + + +def find_batch_size(ensemble_size: int, input_res: int, + dtype: torch.dtype) -> int: + """ + Automatically search for suitable operating batch size. + + Args: + ensemble_size (`int`): + Number of predictions to be ensembled. + input_res (`int`): + Operating resolution of the input image. + + Returns: + `int`: Operating batch size. + """ + if not torch.cuda.is_available(): + return 1 + + total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3 + filtered_bs_search_table = [ + s for s in bs_search_table if s['dtype'] == dtype + ] + for settings in sorted( + filtered_bs_search_table, + key=lambda k: (k['res'], -k['total_vram']), + ): + if input_res <= settings['res'] and total_vram >= settings[ + 'total_vram']: + bs = settings['bs'] + if bs > ensemble_size: + bs = ensemble_size + elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size: + bs = math.ceil(ensemble_size / 2) + return bs + + return 1 + + +def inter_distances(tensors: torch.Tensor): + """ + To calculate the distance between each two depth maps. 
+ """ + distances = [] + for i, j in torch.combinations(torch.arange(tensors.shape[0])): + arr1 = tensors[i:i + 1] + arr2 = tensors[j:j + 1] + distances.append(arr1 - arr2) + dist = torch.concatenate(distances, dim=0) + return dist + + +def ensemble_depths( + input_images: torch.Tensor, + regularizer_strength: float = 0.02, + max_iter: int = 2, + tol: float = 1e-3, + reduction: str = 'median', + max_res: int = None, +): + """ + To ensemble multiple affine-invariant depth images (up to scale and shift), + by aligning estimating the scale and shift + """ + device = input_images.device + dtype = input_images.dtype + np_dtype = np.float32 + + original_input = input_images.clone() + n_img = input_images.shape[0] + ori_shape = input_images.shape + + if max_res is not None: + scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:])) + if scale_factor < 1: + downscaler = torch.nn.Upsample( + scale_factor=scale_factor, mode='nearest') + input_images = downscaler(torch.from_numpy(input_images)).numpy() + + # init guess + _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) + _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) + s_init = 1.0 / (_max - _min).reshape((-1, 1, 1)) + t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1)) + x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype) + + input_images = input_images.to(device) + + # objective function + def closure(x): + length = len(x) + s = x[:int(length / 2)] + t = x[int(length / 2):] + s = torch.from_numpy(s).to(dtype=dtype).to(device) + t = torch.from_numpy(t).to(dtype=dtype).to(device) + + transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view( + (-1, 1, 1)) + dists = inter_distances(transformed_arrays) + sqrt_dist = torch.sqrt(torch.mean(dists**2)) + + if 'mean' == reduction: + pred = torch.mean(transformed_arrays, dim=0) + elif 'median' == reduction: + pred = torch.median(transformed_arrays, dim=0).values + else: + raise ValueError + + near_err = torch.sqrt((0 - torch.min(pred))**2) + far_err = torch.sqrt((1 - torch.max(pred))**2) + + err = sqrt_dist + (near_err + far_err) * regularizer_strength + err = err.detach().cpu().numpy().astype(np_dtype) + return err + + res = minimize( + closure, + x, + method='BFGS', + tol=tol, + options={ + 'maxiter': max_iter, + 'disp': False + }) + x = res.x + length = len(x) + s = x[:int(length / 2)] + t = x[int(length / 2):] + + # Prediction + s = torch.from_numpy(s).to(dtype=dtype).to(device) + t = torch.from_numpy(t).to(dtype=dtype).to(device) + transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1) + if 'mean' == reduction: + aligned_images = torch.mean(transformed_arrays, dim=0) + std = torch.std(transformed_arrays, dim=0) + uncertainty = std + elif 'median' == reduction: + aligned_images = torch.median(transformed_arrays, dim=0).values + # MAD (median absolute deviation) as uncertainty indicator + abs_dev = torch.abs(transformed_arrays - aligned_images) + mad = torch.median(abs_dev, dim=0).values + uncertainty = mad + else: + raise ValueError(f'Unknown reduction method: {reduction}') + + # Scale and shift to [0, 1] + _min = torch.min(aligned_images) + _max = torch.max(aligned_images) + aligned_images = (aligned_images - _min) / (_max - _min) + uncertainty /= _max - _min + + return aligned_images, uncertainty + + +def colorize_depth_maps(depth_map, + min_depth, + max_depth, + cmap='Spectral', + valid_mask=None): + """ + Colorize depth maps. 
+ """ + assert len(depth_map.shape) >= 2, 'Invalid dimension' + + if isinstance(depth_map, torch.Tensor): + depth = depth_map.detach().clone().squeeze().numpy() + elif isinstance(depth_map, np.ndarray): + depth = depth_map.copy().squeeze() + # reshape to [ (B,) H, W ] + if depth.ndim < 3: + depth = depth[np.newaxis, :, :] + + # colorize + cm = matplotlib.colormaps[cmap] + depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1) + img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1 + img_colored_np = np.rollaxis(img_colored_np, 3, 1) + + if valid_mask is not None: + if isinstance(depth_map, torch.Tensor): + valid_mask = valid_mask.detach().numpy() + valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W] + if valid_mask.ndim < 3: + valid_mask = valid_mask[np.newaxis, np.newaxis, :, :] + else: + valid_mask = valid_mask[:, np.newaxis, :, :] + valid_mask = np.repeat(valid_mask, 3, axis=1) + img_colored_np[~valid_mask] = 0 + + if isinstance(depth_map, torch.Tensor): + img_colored = torch.from_numpy(img_colored_np).float() + elif isinstance(depth_map, np.ndarray): + img_colored = img_colored_np + + return img_colored + + +def chw2hwc(chw): + assert 3 == len(chw.shape) + if isinstance(chw, torch.Tensor): + hwc = torch.permute(chw, (1, 2, 0)) + elif isinstance(chw, np.ndarray): + hwc = np.moveaxis(chw, 0, -1) + return hwc + + +def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image: + """ + Resize image to limit maximum edge length while keeping aspect ratio. + + Args: + img (`Image.Image`): + Image to be resized. + max_edge_resolution (`int`): + Maximum edge length (pixel). + + Returns: + `Image.Image`: Resized image. + """ + original_width, original_height = img.size + downscale_factor = min(max_edge_resolution / original_width, + max_edge_resolution / original_height) + + new_width = int(original_width * downscale_factor) + new_height = int(original_height * downscale_factor) + + resized_img = img.resize((new_width, new_height)) + return resized_img diff --git a/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py index e29ad2b9e..9aa0dc053 100644 --- a/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py +++ b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py @@ -22,7 +22,7 @@ Tasks.image_driving_perception, module_name=Models.yolopv2) class YOLOPv2(TorchModel): """ YOLOPv2 use E-ELAN which first adopted in Yolov7 as backbone, SPP+FPN+PAN as neck and head. 
- For more infomation, please refer to https://arxiv.org/pdf/2208.11434.pdf + For more information, please refer to https://arxiv.org/pdf/2208.11434.pdf """ def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/cv/image_driving_perception/preprocessor.py b/modelscope/models/cv/image_driving_perception/preprocessor.py index 3e0e476fd..2bb84eb3a 100644 --- a/modelscope/models/cv/image_driving_perception/preprocessor.py +++ b/modelscope/models/cv/image_driving_perception/preprocessor.py @@ -92,7 +92,7 @@ def __call__( Args: data (str): image path Returns: - Dict[ndarry, Any]: the preprocessed data + Dict[ndarray, Any]: the preprocessed data { "img": the preprocessed resized image (640x640) } diff --git a/modelscope/models/cv/image_editing/__init__.py b/modelscope/models/cv/image_editing/__init__.py index 35341a189..8b77bd0ac 100644 --- a/modelscope/models/cv/image_editing/__init__.py +++ b/modelscope/models/cv/image_editing/__init__.py @@ -5,11 +5,11 @@ if TYPE_CHECKING: from .masactrl import MutualSelfAttentionControl - from .masactrl_utils import regiter_attention_editor_diffusers + from .masactrl_utils import register_attention_editor_diffusers else: _import_structure = { 'masactrl': ['MutualSelfAttentionControl'], - 'masactrl_utils': ['regiter_attention_editor_diffusers'] + 'masactrl_utils': ['register_attention_editor_diffusers'] } import sys diff --git a/modelscope/models/cv/image_editing/masactrl_utils.py b/modelscope/models/cv/image_editing/masactrl_utils.py index a59e987f6..b74ff13f6 100644 --- a/modelscope/models/cv/image_editing/masactrl_utils.py +++ b/modelscope/models/cv/image_editing/masactrl_utils.py @@ -41,7 +41,7 @@ def reset(self): self.cur_att_layer = 0 -def regiter_attention_editor_diffusers(model, editor: AttentionBase): +def register_attention_editor_diffusers(model, editor: AttentionBase): """ Register a attention editor to Diffuser Pipeline, refer from [Prompt-to-Prompt] """ diff --git a/modelscope/models/cv/image_local_feature_matching/__init__.py b/modelscope/models/cv/image_local_feature_matching/__init__.py new file mode 100644 index 000000000..eecc611ec --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .loftr_model import LocalFeatureMatching + +else: + _import_structure = { + 'loftr_image_local_feature_matching': ['LocalFeatureMatching'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_local_feature_matching/loftr_model.py b/modelscope/models/cv/image_local_feature_matching/loftr_model.py new file mode 100644 index 000000000..d47b9da2a --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/loftr_model.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
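# The LoFTR wrapper below is registered with ModelScope the same way as the
# other models added in this patch: MODELS.register_module binds the
# (task, module_name) pair to the class so the framework can build it from a
# model configuration. A hedged usage sketch, assuming a local model directory
# that contains ModelFile.TORCH_MODEL_FILE and the usual configuration.json
# (the path below is a placeholder):
#
#     from modelscope.models import Model
#     matcher = Model.from_pretrained('<local_model_dir>')
#     data = {'image0': image0, 'image1': image1}   # (N, 1, H, W) gray tensors
#     outputs = matcher.inference(data)
#     results = matcher.postprocess(outputs)        # OutputKeys.MATCHES + vis image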
+import io +import os.path as osp +from copy import deepcopy + +import cv2 +import matplotlib.cm as cm +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_local_feature_matching.src.loftr import ( + LoFTR, default_cfg) +from modelscope.models.cv.image_local_feature_matching.src.utils.plotting import \ + make_matching_figure +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_local_feature_matching, + module_name=Models.loftr_image_local_feature_matching) +class LocalFeatureMatching(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + # Initialize LoFTR + _default_cfg = deepcopy(default_cfg) + self.model = LoFTR(config=_default_cfg) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoint = torch.load(model_path, map_location='cpu') + self.model.load_state_dict(checkpoint['state_dict']) + self.model.eval() + + def forward(self, Inputs): + self.model(Inputs) + result = { + 'kpts0': Inputs['mkpts0_f'], + 'kpts1': Inputs['mkpts1_f'], + 'conf': Inputs['mconf'], + } + Inputs.update(result) + return Inputs + + def postprocess(self, Inputs): + # Draw + color = cm.jet(Inputs['conf'].cpu().numpy()) + img0, img1, mkpts0, mkpts1 = Inputs['image0'].squeeze().cpu().numpy( + ), Inputs['image1'].squeeze().cpu().numpy(), Inputs['kpts0'].cpu( + ).numpy(), Inputs['kpts1'].cpu().numpy() + text = [ + 'LoFTR', + 'Matches: {}'.format(len(Inputs['kpts0'])), + ] + img0, img1 = (img0 * 255).astype(np.uint8), (img1 * 255).astype( + np.uint8) + fig = make_matching_figure( + img0, img1, mkpts0, mkpts1, color, text=text) + io_buf = io.BytesIO() + fig.savefig(io_buf, format='png', dpi=75) + io_buf.seek(0) + buf_data = np.frombuffer(io_buf.getvalue(), dtype=np.uint8) + io_buf.close() + vis_img = cv2.imdecode(buf_data, 1) + + results = {OutputKeys.MATCHES: Inputs, OutputKeys.OUTPUT_IMG: vis_img} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_local_feature_matching/src/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py new file mode 100644 index 000000000..0d69b9c13 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py @@ -0,0 +1,2 @@ +from .loftr import LoFTR +from .utils.cvpr_ds_config import default_cfg diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py new file mode 100644 index 000000000..af4f526dd --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py @@ -0,0 +1,12 @@ +from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4 + + +def build_backbone(config): + if config['backbone_type'] == 'ResNetFPN': + if config['resolution'] == (8, 2): + return ResNetFPN_8_2(config['resnetfpn']) + elif config['resolution'] == (16, 4): + return ResNetFPN_16_4(config['resnetfpn']) + else: + raise ValueError( + 
f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py new file mode 100644 index 000000000..ea7583d18 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py @@ -0,0 +1,219 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution without padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias=False) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + + def __init__(self, in_planes, planes, stride=1): + super().__init__() + self.conv1 = conv3x3(in_planes, planes, stride) + self.conv2 = conv3x3(planes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + + if stride == 1: + self.downsample = None + else: + self.downsample = nn.Sequential( + conv1x1(in_planes, planes, stride=stride), + nn.BatchNorm2d(planes)) + + def forward(self, x): + y = x + y = self.relu(self.bn1(self.conv1(y))) + y = self.bn2(self.conv2(y)) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class ResNetFPN_8_2(nn.Module): + """ + ResNet+FPN, output resolution are 1/8 and 1/2. + Each block has 2 layers. + """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + + # 3. 
FPN upsample + self.layer3_outconv = conv1x1(block_dims[2], block_dims[2]) + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + self.layer1_outconv = conv1x1(block_dims[0], block_dims[1]) + self.layer1_outconv2 = nn.Sequential( + conv3x3(block_dims[1], block_dims[1]), + nn.BatchNorm2d(block_dims[1]), + nn.LeakyReLU(), + conv3x3(block_dims[1], block_dims[0]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + + # FPN + x3_out = self.layer3_outconv(x3) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + x2_out_2x = F.interpolate( + x2_out, scale_factor=2., mode='bilinear', align_corners=True) + x1_out = self.layer1_outconv(x1) + x1_out = self.layer1_outconv2(x1_out + x2_out_2x) + + return [x3_out, x1_out] + + +class ResNetFPN_16_4(nn.Module): + """ + ResNet+FPN, output resolution are 1/16 and 1/4. + Each block has 2 layers. + """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + self.layer4 = self._make_layer(block, block_dims[3], stride=2) # 1/16 + + # 3. 
FPN upsample + self.layer4_outconv = conv1x1(block_dims[3], block_dims[3]) + self.layer3_outconv = conv1x1(block_dims[2], block_dims[3]) + self.layer3_outconv2 = nn.Sequential( + conv3x3(block_dims[3], block_dims[3]), + nn.BatchNorm2d(block_dims[3]), + nn.LeakyReLU(), + conv3x3(block_dims[3], block_dims[2]), + ) + + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + x4 = self.layer4(x3) # 1/16 + + # FPN + x4_out = self.layer4_outconv(x4) + + x4_out_2x = F.interpolate( + x4_out, scale_factor=2., mode='bilinear', align_corners=True) + x3_out = self.layer3_outconv(x3) + x3_out = self.layer3_outconv2(x3_out + x4_out_2x) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + return [x4_out, x2_out] diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py new file mode 100644 index 000000000..34cac8879 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py @@ -0,0 +1,93 @@ +import torch +import torch.nn as nn +from einops.einops import rearrange + +from .backbone import build_backbone +from .loftr_module import FinePreprocess, LocalFeatureTransformer +from .utils.coarse_matching import CoarseMatching +from .utils.fine_matching import FineMatching +from .utils.position_encoding import PositionEncodingSine + + +class LoFTR(nn.Module): + + def __init__(self, config): + super().__init__() + # Misc + self.config = config + + # Modules + self.backbone = build_backbone(config) + self.pos_encoding = PositionEncodingSine( + config['coarse']['d_model'], + temp_bug_fix=config['coarse']['temp_bug_fix']) + self.loftr_coarse = LocalFeatureTransformer(config['coarse']) + self.coarse_matching = CoarseMatching(config['match_coarse']) + self.fine_preprocess = FinePreprocess(config) + self.loftr_fine = LocalFeatureTransformer(config['fine']) + self.fine_matching = FineMatching() + + def forward(self, data): + """ + Update: + data (dict): { + 'image0': (torch.Tensor): (N, 1, H, W) + 'image1': (torch.Tensor): (N, 1, H, W) + 'mask0'(optional) : (torch.Tensor): (N, H, W) '0' indicates a padded position + 'mask1'(optional) : (torch.Tensor): (N, H, W) + } + """ + # 1. 
Local Feature CNN + data.update({ + 'bs': data['image0'].size(0), + 'hw0_i': data['image0'].shape[2:], + 'hw1_i': data['image1'].shape[2:] + }) + + if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence + feats_c, feats_f = self.backbone( + torch.cat([data['image0'], data['image1']], dim=0)) + (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split( + data['bs']), feats_f.split(data['bs']) + else: # handle different input shapes + (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone( + data['image0']), self.backbone(data['image1']) + + data.update({ + 'hw0_c': feat_c0.shape[2:], + 'hw1_c': feat_c1.shape[2:], + 'hw0_f': feat_f0.shape[2:], + 'hw1_f': feat_f1.shape[2:] + }) + + # 2. coarse-level loftr module + # add featmap with positional encoding, then flatten it to sequence [N, HW, C] + feat_c0 = rearrange(self.pos_encoding(feat_c0), 'n c h w -> n (h w) c') + feat_c1 = rearrange(self.pos_encoding(feat_c1), 'n c h w -> n (h w) c') + + mask_c0 = mask_c1 = None # mask is useful in training + if 'mask0' in data: + mask_c0, mask_c1 = data['mask0'].flatten( + -2), data['mask1'].flatten(-2) + feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, + mask_c1) + + # 3. match coarse-level + self.coarse_matching( + feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) + + # 4. fine-level refinement + feat_f0_unfold, feat_f1_unfold = self.fine_preprocess( + feat_f0, feat_f1, feat_c0, feat_c1, data) + if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted + feat_f0_unfold, feat_f1_unfold = self.loftr_fine( + feat_f0_unfold, feat_f1_unfold) + + # 5. match fine-level + self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) + + def load_state_dict(self, state_dict, *args, **kwargs): + for k in list(state_dict.keys()): + if k.startswith('matcher.'): + state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k) + return super().load_state_dict(state_dict, *args, **kwargs) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py new file mode 100644 index 000000000..8d83af7e9 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py @@ -0,0 +1,2 @@ +from .fine_preprocess import FinePreprocess +from .transformer import LocalFeatureTransformer diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py new file mode 100644 index 000000000..8624eab5e --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange, repeat + + +class FinePreprocess(nn.Module): + + def __init__(self, config): + super().__init__() + + self.config = config + self.cat_c_feat = config['fine_concat_coarse_feat'] + self.W = self.config['fine_window_size'] + + d_model_c = self.config['coarse']['d_model'] + d_model_f = self.config['fine']['d_model'] + self.d_model_f = d_model_f + if self.cat_c_feat: + self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) + self.merge_feat = nn.Linear(2 * d_model_f, d_model_f, bias=True) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.kaiming_normal_(p, mode='fan_out', nonlinearity='relu') + + def 
forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): + W = self.W + stride = data['hw0_f'][0] // data['hw0_c'][0] + + data.update({'W': W}) + if data['b_ids'].shape[0] == 0: + feat0 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + feat1 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + return feat0, feat1 + + # 1. unfold(crop) all local windows + feat_f0_unfold = F.unfold( + feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f0_unfold = rearrange( + feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + feat_f1_unfold = F.unfold( + feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f1_unfold = rearrange( + feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + + # 2. select only the predicted matches + feat_f0_unfold = feat_f0_unfold[data['b_ids'], + data['i_ids']] # [n, ww, cf] + feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] + + # option: use coarse-level loftr feature as context: concat and linear + if self.cat_c_feat: + feat_c_win = self.down_proj( + torch.cat([ + feat_c0[data['b_ids'], data['i_ids']], + feat_c1[data['b_ids'], data['j_ids']] + ], 0)) # [2n, c] + feat_cf_win = self.merge_feat( + torch.cat( + [ + torch.cat([feat_f0_unfold, feat_f1_unfold], + 0), # [2n, ww, cf] + repeat(feat_c_win, 'n c -> n ww c', ww = W ** 2), # [2n, ww, cf] + ], -1)) # yapf: disable + feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) + + return feat_f0_unfold, feat_f1_unfold diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py new file mode 100644 index 000000000..8e4f11d1d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py @@ -0,0 +1,86 @@ +""" +Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" +Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py +""" + +import torch +from torch.nn import Dropout, Module + + +def elu_feature_map(x): + return torch.nn.functional.elu(x) + 1 + + +class LinearAttention(Module): + + def __init__(self, eps=1e-6): + super().__init__() + self.feature_map = elu_feature_map + self.eps = eps + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """ Multi-Head linear attention proposed in "Transformers are RNNs" + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + Q = self.feature_map(queries) + K = self.feature_map(keys) + + # set padded position to zero + if q_mask is not None: + Q = Q * q_mask[:, :, None, None] + if kv_mask is not None: + K = K * kv_mask[:, :, None, None] + values = values * kv_mask[:, :, None, None] + + v_length = values.size(1) + values = values / v_length # prevent fp16 overflow + KV = torch.einsum('nshd,nshv->nhdv', K, values) # (S,D)' @ S,V + Z = 1 / (torch.einsum('nlhd,nhd->nlh', Q, K.sum(dim=1)) + self.eps) + queried_values = torch.einsum('nlhd,nhdv,nlh->nlhv', Q, KV, + Z) * v_length + + return queried_values.contiguous() + + +class FullAttention(Module): + + def __init__(self, use_dropout=False, attention_dropout=0.1): + super().__init__() + self.use_dropout = use_dropout + self.dropout = Dropout(attention_dropout) + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 
+ """ Multi-head scaled dot-product attention, a.k.a full attention. + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + + # Compute the unnormalized attention and apply the masks + QK = torch.einsum('nlhd,nshd->nlsh', queries, keys) + if kv_mask is not None: + QK.masked_fill_( + ~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), + float('-inf')) + + # Compute the attention and the weighted average + softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) + A = torch.softmax(softmax_temp * QK, dim=2) + if self.use_dropout: + A = self.dropout(A) + + queried_values = torch.einsum('nlsh,nshd->nlhd', A, values) + + return queried_values.contiguous() diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py new file mode 100644 index 000000000..4c28f20d7 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py @@ -0,0 +1,111 @@ +import copy + +import torch +import torch.nn as nn + +from .linear_attention import FullAttention, LinearAttention + + +class LoFTREncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, attention='linear'): + super(LoFTREncoderLayer, self).__init__() + + self.dim = d_model // nhead + self.nhead = nhead + + # multi-head attention + self.q_proj = nn.Linear(d_model, d_model, bias=False) + self.k_proj = nn.Linear(d_model, d_model, bias=False) + self.v_proj = nn.Linear(d_model, d_model, bias=False) + self.attention = LinearAttention( + ) if attention == 'linear' else FullAttention() + self.merge = nn.Linear(d_model, d_model, bias=False) + + # feed-forward network + self.mlp = nn.Sequential( + nn.Linear(d_model * 2, d_model * 2, bias=False), + nn.ReLU(True), + nn.Linear(d_model * 2, d_model, bias=False), + ) + + # norm and dropout + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + + def forward(self, x, source, x_mask=None, source_mask=None): + """ + Args: + x (torch.Tensor): [N, L, C] + source (torch.Tensor): [N, S, C] + x_mask (torch.Tensor): [N, L] (optional) + source_mask (torch.Tensor): [N, S] (optional) + """ + bs = x.size(0) + query, key, value = x, source, source + + # multi-head attention + query = self.q_proj(query).view(bs, -1, self.nhead, + self.dim) # [N, L, (H, D)] + key = self.k_proj(key).view(bs, -1, self.nhead, + self.dim) # [N, S, (H, D)] + value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) + message = self.attention( + query, key, value, q_mask=x_mask, + kv_mask=source_mask) # [N, L, (H, D)] + message = self.merge(message.view(bs, -1, + self.nhead * self.dim)) # [N, L, C] + message = self.norm1(message) + + # feed-forward network + message = self.mlp(torch.cat([x, message], dim=2)) + message = self.norm2(message) + + return x + message + + +class LocalFeatureTransformer(nn.Module): + """A Local Feature Transformer (LoFTR) module.""" + + def __init__(self, config): + super(LocalFeatureTransformer, self).__init__() + + self.config = config + self.d_model = config['d_model'] + self.nhead = config['nhead'] + self.layer_names = config['layer_names'] + encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], + config['attention']) + self.layers = nn.ModuleList([ + copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names)) + ]) + self._reset_parameters() + + def _reset_parameters(self): + for p in 
self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feat0, feat1, mask0=None, mask1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + mask0 (torch.Tensor): [N, L] (optional) + mask1 (torch.Tensor): [N, S] (optional) + """ + + assert self.d_model == feat0.size( + 2), 'the feature number of src and transformer must be equal' + + for layer, name in zip(self.layers, self.layer_names): + if name == 'self': + feat0 = layer(feat0, feat0, mask0, mask0) + feat1 = layer(feat1, feat1, mask1, mask1) + elif name == 'cross': + feat0 = layer(feat0, feat1, mask0, mask1) + feat1 = layer(feat1, feat0, mask1, mask0) + else: + raise KeyError + + return feat0, feat1 diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py new file mode 100644 index 000000000..c78356898 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py @@ -0,0 +1,264 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange + +INF = 1e9 + + +def mask_border(m, b: int, v): + """ Mask borders with value + Args: + m (torch.Tensor): [N, H0, W0, H1, W1] + b (int) + v (m.dtype) + """ + if b <= 0: + return + + m[:, :b] = v + m[:, :, :b] = v + m[:, :, :, :b] = v + m[:, :, :, :, :b] = v + m[:, -b:] = v + m[:, :, -b:] = v + m[:, :, :, -b:] = v + m[:, :, :, :, -b:] = v + + +def mask_border_with_padding(m, bd, v, p_m0, p_m1): + if bd <= 0: + return + + m[:, :bd] = v + m[:, :, :bd] = v + m[:, :, :, :bd] = v + m[:, :, :, :, :bd] = v + + h0s, w0s = p_m0.sum(1).max(-1)[0].int(), p_m0.sum(-1).max(-1)[0].int() + h1s, w1s = p_m1.sum(1).max(-1)[0].int(), p_m1.sum(-1).max(-1)[0].int() + for b_idx, (h0, w0, h1, w1) in enumerate(zip(h0s, w0s, h1s, w1s)): + m[b_idx, h0 - bd:] = v + m[b_idx, :, w0 - bd:] = v + m[b_idx, :, :, h1 - bd:] = v + m[b_idx, :, :, :, w1 - bd:] = v + + +def compute_max_candidates(p_m0, p_m1): + """Compute the max candidates of all pairs within a batch + + Args: + p_m0, p_m1 (torch.Tensor): padded masks + """ + h0s, w0s = p_m0.sum(1).max(-1)[0], p_m0.sum(-1).max(-1)[0] + h1s, w1s = p_m1.sum(1).max(-1)[0], p_m1.sum(-1).max(-1)[0] + max_cand = torch.sum( + torch.min(torch.stack([h0s * w0s, h1s * w1s], -1), -1)[0]) + return max_cand + + +class CoarseMatching(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + # general config + self.thr = config['thr'] + self.border_rm = config['border_rm'] + # -- # for trainig fine-level LoFTR + self.train_coarse_percent = config['train_coarse_percent'] + self.train_pad_num_gt_min = config['train_pad_num_gt_min'] + + # we provide 2 options for differentiable matching + self.match_type = config['match_type'] + if self.match_type == 'dual_softmax': + self.temperature = config['dsmax_temperature'] + elif self.match_type == 'sinkhorn': + try: + from .superglue import log_optimal_transport + except ImportError: + raise ImportError('download superglue.py first!') + self.log_optimal_transport = log_optimal_transport + self.bin_score = nn.Parameter( + torch.tensor(config['skh_init_bin_score'], requires_grad=True)) + self.skh_iters = config['skh_iters'] + 
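# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# CoarseMatching supports two differentiable matchers. The default,
# 'dual_softmax', multiplies a row-wise and a column-wise softmax of the
# similarity matrix and then keeps thresholded mutual nearest neighbours, as
# the forward/get_coarse_match code below does. A minimal standalone sketch
# with toy tensors (all names here are hypothetical):
import torch
import torch.nn.functional as F

def dual_softmax_sketch(feat_c0, feat_c1, temperature=0.1, thr=0.2):
    sim = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) / temperature
    conf = F.softmax(sim, 1) * F.softmax(sim, 2)      # soft mutual assignment
    mask = (conf > thr) \
        & (conf == conf.max(dim=2, keepdim=True)[0]) \
        & (conf == conf.max(dim=1, keepdim=True)[0])  # mutual nearest + thr
    return conf, mask

d = 32
f0, f1 = torch.randn(1, 6, d) / d**.5, torch.randn(1, 8, d) / d**.5
conf, mask = dual_softmax_sketch(f0, f1)
print(conf.shape, int(mask.sum()))  # torch.Size([1, 6, 8]), number of matches
# ------------------------------------------------------------------------------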
self.skh_prefilter = config['skh_prefilter'] + else: + raise NotImplementedError() + + def forward(self, feat_c0, feat_c1, data, mask_c0=None, mask_c1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + data (dict) + mask_c0 (torch.Tensor): [N, L] (optional) + mask_c1 (torch.Tensor): [N, S] (optional) + Update: + data (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + NOTE: M' != M during training. + """ + _, L, S, _ = feat_c0.size(0), feat_c0.size(1), feat_c1.size( + 1), feat_c0.size(2) + + # normalize + feat_c0, feat_c1 = map(lambda feat: feat / feat.shape[-1]**.5, + [feat_c0, feat_c1]) + + if self.match_type == 'dual_softmax': + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, + feat_c1) / self.temperature + if mask_c0 is not None: + sim_matrix.masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2) + + elif self.match_type == 'sinkhorn': + # sinkhorn, dustbin included + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) + if mask_c0 is not None: + sim_matrix[:, :L, :S].masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + + # build uniform prior & use sinkhorn + log_assign_matrix = self.log_optimal_transport( + sim_matrix, self.bin_score, self.skh_iters) + assign_matrix = log_assign_matrix.exp() + conf_matrix = assign_matrix[:, :-1, :-1] + + # filter prediction with dustbin score (only in evaluation mode) + if not self.training and self.skh_prefilter: + filter0 = (assign_matrix.max(dim=2)[1] == S)[:, :-1] # [N, L] + filter1 = (assign_matrix.max(dim=1)[1] == L)[:, :-1] # [N, S] + conf_matrix[filter0[..., None].repeat(1, 1, S)] = 0 + conf_matrix[filter1[:, None].repeat(1, L, 1)] = 0 + + if self.config['sparse_spvs']: + data.update({'conf_matrix_with_bin': assign_matrix.clone()}) + + data.update({'conf_matrix': conf_matrix}) + + # predict coarse matches from conf_matrix + data.update(**self.get_coarse_match(conf_matrix, data)) + + @torch.no_grad() + def get_coarse_match(self, conf_matrix, data): + """ + Args: + conf_matrix (torch.Tensor): [N, L, S] + data (dict): with keys ['hw0_i', 'hw1_i', 'hw0_c', 'hw1_c'] + Returns: + coarse_matches (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'm_bids' (torch.Tensor): [M], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + """ + axes_lengths = { + 'h0c': data['hw0_c'][0], + 'w0c': data['hw0_c'][1], + 'h1c': data['hw1_c'][0], + 'w1c': data['hw1_c'][1] + } + _device = conf_matrix.device + # 1. confidence thresholding + mask = conf_matrix > self.thr + mask = rearrange(mask, 'b (h0c w0c) (h1c w1c) -> b h0c w0c h1c w1c', + **axes_lengths) + if 'mask0' not in data: + mask_border(mask, self.border_rm, False) + else: + mask_border_with_padding(mask, self.border_rm, False, + data['mask0'], data['mask1']) + mask = rearrange(mask, 'b h0c w0c h1c w1c -> b (h0c w0c) (h1c w1c)', + **axes_lengths) + + # 2. mutual nearest + mask = mask \ + * (conf_matrix == conf_matrix.max(dim=2, keepdim=True)[0]) \ + * (conf_matrix == conf_matrix.max(dim=1, keepdim=True)[0]) + + # 3. 
find all valid coarse matches + # this only works when at most one `True` in each row + mask_v, all_j_ids = mask.max(dim=2) + b_ids, i_ids = torch.where(mask_v) + j_ids = all_j_ids[b_ids, i_ids] + mconf = conf_matrix[b_ids, i_ids, j_ids] + + # 4. Random sampling of training samples for fine-level LoFTR + # (optional) pad samples with gt coarse-level matches + if self.training: + # NOTE: + # The sampling is performed across all pairs in a batch without manually balancing + # #samples for fine-level increases w.r.t. batch_size + if 'mask0' not in data: + num_candidates_max = mask.size(0) * max( + mask.size(1), mask.size(2)) + else: + num_candidates_max = compute_max_candidates( + data['mask0'], data['mask1']) + num_matches_train = int(num_candidates_max + * self.train_coarse_percent) + num_matches_pred = len(b_ids) + assert self.train_pad_num_gt_min < num_matches_train, 'min-num-gt-pad should be less than num-train-matches' + + # pred_indices is to select from prediction + if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min: + pred_indices = torch.arange(num_matches_pred, device=_device) + else: + pred_indices = torch.randint( + num_matches_pred, + (num_matches_train - self.train_pad_num_gt_min, ), + device=_device) + + # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200) + gt_pad_indices = torch.randint( + len(data['spv_b_ids']), + (max(num_matches_train - num_matches_pred, + self.train_pad_num_gt_min), ), + device=_device) + mconf_gt = torch.zeros( + len(data['spv_b_ids']), + device=_device) # set conf of gt paddings to all zero + + b_ids, i_ids, j_ids, mconf = map( + lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]], + dim=0), + *zip([b_ids, data['spv_b_ids']], [i_ids, data['spv_i_ids']], + [j_ids, data['spv_j_ids']], [mconf, mconf_gt])) + + # These matches select patches that feed into fine-level network + coarse_matches = {'b_ids': b_ids, 'i_ids': i_ids, 'j_ids': j_ids} + + # 4. Update with matches in original image resolution + scale = data['hw0_i'][0] / data['hw0_c'][0] + scale0 = scale * data['scale0'][b_ids] if 'scale0' in data else scale + scale1 = scale * data['scale1'][b_ids] if 'scale1' in data else scale + mkpts0_c = torch.stack( + [i_ids % data['hw0_c'][1], i_ids // data['hw0_c'][1]], + dim=1) * scale0 + mkpts1_c = torch.stack( + [j_ids % data['hw1_c'][1], j_ids // data['hw1_c'][1]], + dim=1) * scale1 + + # These matches is the current prediction (for visualization) + coarse_matches.update({ + 'gt_mask': mconf == 0, + 'm_bids': b_ids[mconf != 0], # mconf == 0 => gt matches + 'mkpts0_c': mkpts0_c[mconf != 0], + 'mkpts1_c': mkpts1_c[mconf != 0], + 'mconf': mconf[mconf != 0] + }) + + return coarse_matches diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py new file mode 100644 index 000000000..1c9ce7015 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py @@ -0,0 +1,50 @@ +from yacs.config import CfgNode as CN + + +def lower_config(yacs_cfg): + if not isinstance(yacs_cfg, CN): + return yacs_cfg + return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()} + + +_CN = CN() +_CN.BACKBONE_TYPE = 'ResNetFPN' +_CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)] +_CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd +_CN.FINE_CONCAT_COARSE_FEAT = True + +# 1. 
LoFTR-backbone (local feature CNN) config +_CN.RESNETFPN = CN() +_CN.RESNETFPN.INITIAL_DIM = 128 +_CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3 + +# 2. LoFTR-coarse module config +_CN.COARSE = CN() +_CN.COARSE.D_MODEL = 256 +_CN.COARSE.D_FFN = 256 +_CN.COARSE.NHEAD = 8 +_CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4 +_CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full'] +_CN.COARSE.TEMP_BUG_FIX = False + +# 3. Coarse-Matching config +_CN.MATCH_COARSE = CN() +_CN.MATCH_COARSE.THR = 0.2 +_CN.MATCH_COARSE.BORDER_RM = 2 +_CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn'] +_CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1 +_CN.MATCH_COARSE.SKH_ITERS = 3 +_CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0 +_CN.MATCH_COARSE.SKH_PREFILTER = True +_CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4 # training tricks: save GPU memory +_CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock + +# 4. LoFTR-fine module config +_CN.FINE = CN() +_CN.FINE.D_MODEL = 128 +_CN.FINE.D_FFN = 128 +_CN.FINE.NHEAD = 8 +_CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1 +_CN.FINE.ATTENTION = 'linear' + +default_cfg = lower_config(_CN) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py new file mode 100644 index 000000000..35903212d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py @@ -0,0 +1,171 @@ +import math + +import torch +import torch.nn as nn + + +def create_meshgrid( + height: int, + width: int, + normalized_coordinates: bool = True, + device=None, + dtype=None, +): + """Generate a coordinate grid for an image. + + When the flag ``normalized_coordinates`` is set to True, the grid is + normalized to be in the range :math:`[-1,1]` to be consistent with the pytorch + function :py:func:`torch.nn.functional.grid_sample`. + + Args: + height: the image height (rows). + width: the image width (cols). + normalized_coordinates: whether to normalize + coordinates in the range :math:`[-1,1]` in order to be consistent with the + PyTorch function :py:func:`torch.nn.functional.grid_sample`. + device: the device on which the grid will be generated. + dtype: the data type of the generated grid. + + Return: + grid tensor with shape :math:`(1, H, W, 2)`. + + Example: + >>> create_meshgrid(2, 2) + tensor([[[[-1., -1.], + [ 1., -1.]], + + [[-1., 1.], + [ 1., 1.]]]]) + + >>> create_meshgrid(2, 2, normalized_coordinates=False) + tensor([[[[0., 0.], + [1., 0.]], + + [[0., 1.], + [1., 1.]]]]) + """ + xs = torch.linspace(0, width - 1, width, device=device, dtype=dtype) + ys = torch.linspace(0, height - 1, height, device=device, dtype=dtype) + if normalized_coordinates: + xs = (xs / (width - 1) - 0.5) * 2 + ys = (ys / (height - 1) - 0.5) * 2 + base_grid = torch.stack( + torch.meshgrid([xs, ys], indexing='ij'), dim=-1) # WxHx2 + return base_grid.permute(1, 0, 2).unsqueeze(0) # 1xHxWx2 + + +def spatial_expectation2d(input, normalized_coordinates: bool = True): + r"""Compute the expectation of coordinate values using spatial probabilities. + + The input heatmap is assumed to represent a valid spatial probability distribution, + which can be achieved using :func:`~kornia.geometry.subpixel.spatial_softmax2d`. + + Args: + input: the input tensor representing dense spatial probabilities with shape :math:`(B, N, H, W)`. 
+ normalized_coordinates: whether to return the coordinates normalized in the range + of :math:`[-1, 1]`. Otherwise, it will return the coordinates in the range of the input shape. + + Returns: + expected value of the 2D coordinates with shape :math:`(B, N, 2)`. Output order of the coordinates is (x, y). + + Examples: + >>> heatmaps = torch.tensor([[[ + ... [0., 0., 0.], + ... [0., 0., 0.], + ... [0., 1., 0.]]]]) + >>> spatial_expectation2d(heatmaps, False) + tensor([[[1., 2.]]]) + """ + + batch_size, channels, height, width = input.shape + + # Create coordinates grid. + grid = create_meshgrid(height, width, normalized_coordinates, input.device) + grid = grid.to(input.dtype) + + pos_x = grid[..., 0].reshape(-1) + pos_y = grid[..., 1].reshape(-1) + + input_flat = input.view(batch_size, channels, -1) + + # Compute the expectation of the coordinates. + expected_y = torch.sum(pos_y * input_flat, -1, keepdim=True) + expected_x = torch.sum(pos_x * input_flat, -1, keepdim=True) + + output = torch.cat([expected_x, expected_y], -1) + + return output.view(batch_size, channels, 2) # BxNx2 + + +class FineMatching(nn.Module): + """FineMatching with s2d paradigm""" + + def __init__(self): + super().__init__() + + def forward(self, feat_f0, feat_f1, data): + """ + Args: + feat0 (torch.Tensor): [M, WW, C] + feat1 (torch.Tensor): [M, WW, C] + data (dict) + Update: + data (dict):{ + 'expec_f' (torch.Tensor): [M, 3], + 'mkpts0_f' (torch.Tensor): [M, 2], + 'mkpts1_f' (torch.Tensor): [M, 2]} + """ + M, WW, C = feat_f0.shape + W = int(math.sqrt(WW)) + scale = data['hw0_i'][0] / data['hw0_f'][0] + self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale + + # corner case: if no coarse matches found + if M == 0: + assert self.training is False, 'M is always >0, when training, see coarse_matching.py' + # logger.warning('No matches found in coarse-level.') + data.update({ + 'expec_f': torch.empty(0, 3, device=feat_f0.device), + 'mkpts0_f': data['mkpts0_c'], + 'mkpts1_f': data['mkpts1_c'], + }) + return + + feat_f0_picked = feat_f0_picked = feat_f0[:, WW // 2, :] + sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) + softmax_temp = 1. 
/ C**.5 + heatmap = torch.softmax( + softmax_temp * sim_matrix, dim=1).view(-1, W, W) + + # compute coordinates from heatmap + coords_normalized = spatial_expectation2d(heatmap[None], + True)[0] # [M, 2] + grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape( + 1, -1, 2) # [1, WW, 2] + + # compute std over + var = torch.sum( + grid_normalized**2 * heatmap.view(-1, WW, 1), + dim=1) - coords_normalized**2 # [M, 2] + std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), + -1) # [M] clamp needed for numerical stability + + # for fine-level supervision + data.update( + {'expec_f': + torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) + + # compute absolute kpt coords + self.get_fine_match(coords_normalized, data) + + @torch.no_grad() + def get_fine_match(self, coords_normed, data): + W, _, _, scale = self.W, self.WW, self.C, self.scale + + # mkpts0_f and mkpts1_f + mkpts0_f = data['mkpts0_c'] + scale1 = scale * data['scale1'][ + data['b_ids']] if 'scale0' in data else scale + mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] # yapf: disable + + data.update({'mkpts0_f': mkpts0_f, 'mkpts1_f': mkpts1_f}) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py new file mode 100644 index 000000000..214a3a7af --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py @@ -0,0 +1,57 @@ +import torch + + +@torch.no_grad() +def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): + """ Warp kpts0 from I0 to I1 with depth, K and Rt + Also check covisibility and depth consistency. + Depth is consistent if relative error < 0.2 (hard-coded). + + Args: + kpts0 (torch.Tensor): [N, L, 2] - , + depth0 (torch.Tensor): [N, H, W], + depth1 (torch.Tensor): [N, H, W], + T_0to1 (torch.Tensor): [N, 3, 4], + K0 (torch.Tensor): [N, 3, 3], + K1 (torch.Tensor): [N, 3, 3], + Returns: + calculable_mask (torch.Tensor): [N, L] + warped_keypoints0 (torch.Tensor): [N, L, 2] + """ + kpts0_long = kpts0.round().long() + + # Sample depth, get calculable_mask on depth != 0 + kpts0_depth = torch.stack([ + depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] + for i in range(kpts0.shape[0]) + ], + dim=0) # noqa E501 + nonzero_mask = kpts0_depth != 0 + + # Unproject + kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], + dim=-1) * kpts0_depth[..., None] # (N, L, 3) + kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) + + # Rigid Transform + w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, + [3]] # (N, 3, L) + w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] + + # Project + w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) + w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4 + ) # (N, L, 2), +1e-4 to avoid zero depth + + # Covisible Check + h, w = depth1.shape[1:3] + covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w - 1) * (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h - 1) # noqa E501 yapf: disable + w_kpts0_long = w_kpts0.long() + w_kpts0_long[~covisible_mask, :] = 0 + + w_kpts0_depth = torch.stack([depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0) # noqa E501 yapf: disable + consistent_mask = ( + (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 + valid_mask = nonzero_mask * covisible_mask * consistent_mask + + return valid_mask, w_kpts0 diff --git 
a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py new file mode 100644 index 000000000..c5e7355d8 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py @@ -0,0 +1,44 @@ +import math + +import torch +from torch import nn + + +class PositionEncodingSine(nn.Module): + """ + This is a sinusoidal position encoding that generalized to 2-dimensional images + """ + + def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=True): + """ + Args: + max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels + temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41), + the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact + on the final performance. For now, we keep both impls for backward compatability. + We will remove the buggy impl after re-training all variants of our released models. + """ + super().__init__() + + pe = torch.zeros((d_model, *max_shape)) + y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) + x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) + if temp_bug_fix: + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / (d_model // 2))) # noqa E501 yapf: disable + else: # a buggy implementation (for backward compatability only) + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / d_model // 2)) # noqa E501 yapf: disable + div_term = div_term[:, None, None] # [C//4, 1, 1] + pe[0::4, :, :] = torch.sin(x_position * div_term) + pe[1::4, :, :] = torch.cos(x_position * div_term) + pe[2::4, :, :] = torch.sin(y_position * div_term) + pe[3::4, :, :] = torch.cos(y_position * div_term) + + self.register_buffer( + 'pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] + + def forward(self, x): + """ + Args: + x: [N, C, H, W] + """ + return x + self.pe[:, :, :x.size(2), :x.size(3)] diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py new file mode 100644 index 000000000..02d25d05d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py @@ -0,0 +1,160 @@ +from math import log + +import torch +from einops import repeat +from kornia.utils import create_meshgrid +from loguru import logger + +from .geometry import warp_kpts + +# ↓ Coarse-Level supervision ↓ ############## + + +@torch.no_grad() +def mask_pts_at_padded_regions(grid_pt, mask): + """For megadepth dataset, zero-padding exists in images""" + mask = repeat(mask, 'n h w -> n (h w) c', c=2) + grid_pt[~mask.bool()] = 0 + return grid_pt + + +@torch.no_grad() +def spvs_coarse(data, config): + """ + Update: + data (dict): { + "conf_matrix_gt": [N, hw0, hw1], + 'spv_b_ids': [M] + 'spv_i_ids': [M] + 'spv_j_ids': [M] + 'spv_w_pt0_i': [N, hw0, 2], in original image resolution + 'spv_pt1_i': [N, hw1, 2], in original image resolution + } + + NOTE: + - for scannet dataset, there're 3 kinds of resolution {i, c, f} + - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f} + """ + # 1. 
misc + device = data['image0'].device + N, _, H0, W0 = data['image0'].shape + _, _, H1, W1 = data['image1'].shape + scale = config['LOFTR']['RESOLUTION'][0] + scale0 = scale * data['scale0'][:, None] if 'scale0' in data else scale + scale1 = scale * data['scale1'][:, None] if 'scale1' in data else scale + h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1]) + + # 2. warp grids + # create kpts in meshgrid and resize them to image resolution + grid_pt0_c = create_meshgrid(h0, w0, False, + device).reshape(1, h0 * w0, + 2).repeat(N, 1, + 1) # [N, hw, 2] + grid_pt0_i = scale0 * grid_pt0_c + grid_pt1_c = create_meshgrid(h1, w1, False, + device).reshape(1, h1 * w1, + 2).repeat(N, 1, 1) + grid_pt1_i = scale1 * grid_pt1_c + + # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt + if 'mask0' in data: + grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data['mask0']) + grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data['mask1']) + + # warp kpts bi-directionally and resize them to coarse-level resolution + # (no depth consistency check, since it leads to worse results experimentally) + # (unhandled edge case: points with 0-depth will be warped to the left-up corner) + _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], + data['T_0to1'], data['K0'], data['K1']) + _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], + data['T_1to0'], data['K1'], data['K0']) + w_pt0_c = w_pt0_i / scale1 + w_pt1_c = w_pt1_i / scale0 + + # 3. check if mutual nearest neighbor + w_pt0_c_round = w_pt0_c[:, :, :].round().long() + nearest_index1 = w_pt0_c_round[..., 0] + w_pt0_c_round[..., 1] * w1 + w_pt1_c_round = w_pt1_c[:, :, :].round().long() + nearest_index0 = w_pt1_c_round[..., 0] + w_pt1_c_round[..., 1] * w0 + + # corner case: out of boundary + def out_bound_mask(pt, w, h): + return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + ( + pt[..., 1] >= h) + + nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0 + nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0 + + loop_back = torch.stack( + [nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], + dim=0) + correct_0to1 = loop_back == torch.arange( + h0 * w0, device=device)[None].repeat(N, 1) + correct_0to1[:, 0] = False # ignore the top-left corner + + # 4. construct a gt conf_matrix + conf_matrix_gt = torch.zeros(N, h0 * w0, h1 * w1, device=device) + b_ids, i_ids = torch.where(correct_0to1 != 0) + j_ids = nearest_index1[b_ids, i_ids] + + conf_matrix_gt[b_ids, i_ids, j_ids] = 1 + data.update({'conf_matrix_gt': conf_matrix_gt}) + + # 5. save coarse matches(gt) for training fine level + if len(b_ids) == 0: + logger.warning( + f"No groundtruth coarse match found for: {data['pair_names']}") + # this won't affect fine-level loss calculation + b_ids = torch.tensor([0], device=device) + i_ids = torch.tensor([0], device=device) + j_ids = torch.tensor([0], device=device) + + data.update({'spv_b_ids': b_ids, 'spv_i_ids': i_ids, 'spv_j_ids': j_ids}) + + # 6. save intermediate results (for fast fine-level computation) + data.update({'spv_w_pt0_i': w_pt0_i, 'spv_pt1_i': grid_pt1_i}) + + +def compute_supervision_coarse(data, config): + assert len(set( + data['dataset_name'])) == 1, 'Do not support mixed datasets training!' 
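# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# spvs_coarse above flattens 2D coarse-grid cells into 1D indices with
# x + y * w (and get_coarse_match inverts this with % and //). A tiny sketch
# of that index round-trip with hypothetical values:
import torch

w1 = 5                                              # coarse-grid width
xy = torch.tensor([[2, 3], [4, 0]])                 # (x, y) cell coordinates
flat = xy[:, 0] + xy[:, 1] * w1                     # tensor([17, 4])
back = torch.stack([flat % w1, flat // w1], dim=1)  # recovers (x, y)
assert torch.equal(back, xy)                        # lossless round-trip
# ------------------------------------------------------------------------------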
+ data_source = data['dataset_name'][0] + if data_source.lower() in ['scannet', 'megadepth']: + spvs_coarse(data, config) + else: + raise ValueError(f'Unknown data source: {data_source}') + + +# ↓ Fine-Level supervision ↓ ############## + + +@torch.no_grad() +def spvs_fine(data, config): + """ + Update: + data (dict):{ + "expec_f_gt": [M, 2]} + """ + # 1. misc + # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i') + w_pt0_i, pt1_i = data['spv_w_pt0_i'], data['spv_pt1_i'] + scale = config['LOFTR']['RESOLUTION'][1] + radius = config['LOFTR']['FINE_WINDOW_SIZE'] // 2 + + # 2. get coarse prediction + b_ids, i_ids, j_ids = data['b_ids'], data['i_ids'], data['j_ids'] + + # 3. compute gt + scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale + # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later + expec_f_gt = (w_pt0_i[b_ids, i_ids] + - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] + data.update({'expec_f_gt': expec_f_gt}) + + +def compute_supervision_fine(data, config): + data_source = data['dataset_name'][0] + if data_source.lower() in ['scannet', 'megadepth']: + spvs_fine(data, config) + else: + raise NotImplementedError diff --git a/modelscope/models/cv/image_local_feature_matching/src/utils/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py new file mode 100644 index 000000000..206f90374 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py @@ -0,0 +1,177 @@ +import bisect + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np + + +def _compute_conf_thresh(data): + dataset_name = data['dataset_name'][0].lower() + if dataset_name == 'scannet': + thr = 5e-4 + elif dataset_name == 'megadepth': + thr = 1e-4 + else: + raise ValueError(f'Unknown dataset: {dataset_name}') + return thr + + +# --- VISUALIZATION --- # + + +def make_matching_figure(img0, + img1, + mkpts0, + mkpts1, + color, + kpts0=None, + kpts1=None, + text=[], + dpi=75, + path=None): + # draw image pair + assert mkpts0.shape[0] == mkpts1.shape[ + 0], f'mkpts0: {mkpts0.shape[0]} v.s. 
mkpts1: {mkpts1.shape[0]}' + fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) + axes[0].imshow(img0, cmap='gray') + axes[1].imshow(img1, cmap='gray') + for i in range(2): # clear all frames + axes[i].get_yaxis().set_ticks([]) + axes[i].get_xaxis().set_ticks([]) + for spine in axes[i].spines.values(): + spine.set_visible(False) + plt.tight_layout(pad=1) + + if kpts0 is not None: + assert kpts1 is not None + axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) + axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2) + + # draw matches + if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0: + fig.canvas.draw() + transFigure = fig.transFigure.inverted() + fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) + fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) + fig.lines = [ + matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), + (fkpts0[i, 1], fkpts1[i, 1]), + transform=fig.transFigure, + c=color[i], + linewidth=1) for i in range(len(mkpts0)) + ] + + axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) + axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) + + # put txts + txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' + fig.text( + 0.01, + 0.99, + '\n'.join(text), + transform=fig.axes[0].transAxes, + fontsize=15, + va='top', + ha='left', + color=txt_color) + + # save or return figure + if path: + plt.savefig(str(path), bbox_inches='tight', pad_inches=0) + plt.close() + else: + return fig + + +def _make_evaluation_figure(data, b_id, alpha='dynamic'): + b_mask = data['m_bids'] == b_id + conf_thr = _compute_conf_thresh(data) + + img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) + img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) + kpts0 = data['mkpts0_f'][b_mask].cpu().numpy() + kpts1 = data['mkpts1_f'][b_mask].cpu().numpy() + + # for megadepth, we visualize matches on the resized image + if 'scale0' in data: + kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]] + kpts1 = kpts1 / data['scale1'][b_id].cpu().numpy()[[1, 0]] + + epi_errs = data['epi_errs'][b_mask].cpu().numpy() + correct_mask = epi_errs < conf_thr + precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0 + n_correct = np.sum(correct_mask) + n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu()) + recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches) + # recall might be larger than 1, since the calculation of conf_matrix_gt + # uses groundtruth depths and camera poses, but epipolar distance is used here. + + # matching info + if alpha == 'dynamic': + alpha = dynamic_alpha(len(correct_mask)) + color = error_colormap(epi_errs, conf_thr, alpha=alpha) + + text = [ + f'#Matches {len(kpts0)}', + f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}', + f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}' + ] + + # make the figure + figure = make_matching_figure(img0, img1, kpts0, kpts1, color, text=text) + return figure + + +def _make_confidence_figure(data, b_id): + # TODO: Implement confidence figure + raise NotImplementedError() + + +def make_matching_figures(data, config, mode='evaluation'): + """ Make matching figures for a batch. + + Args: + data (Dict): a batch updated by PL_LoFTR. 
+ config (Dict): matcher config + Returns: + figures (Dict[str, List[plt.figure]] + """ + assert mode in ['evaluation', 'confidence'] # 'confidence' + figures = {mode: []} + for b_id in range(data['image0'].size(0)): + if mode == 'evaluation': + fig = _make_evaluation_figure( + data, b_id, alpha=config.TRAINER.PLOT_MATCHES_ALPHA) + elif mode == 'confidence': + fig = _make_confidence_figure(data, b_id) + else: + raise ValueError(f'Unknown plot mode: {mode}') + figures[mode].append(fig) + return figures + + +def dynamic_alpha(n_matches, + milestones=[0, 300, 1000, 2000], + alphas=[1.0, 0.8, 0.4, 0.2]): + if n_matches == 0: + return 1.0 + ranges = list(zip(alphas, alphas[1:] + [None])) + loc = bisect.bisect_right(milestones, n_matches) - 1 + _range = ranges[loc] + if _range[1] is None: + return _range[0] + return _range[1] + (milestones[loc + 1] - n_matches) / ( + milestones[loc + 1] - milestones[loc]) * ( + _range[0] - _range[1]) + + +def error_colormap(err, thr, alpha=1.0): + assert alpha <= 1.0 and alpha > 0, f'Invaid alpha value: {alpha}' + x = 1 - np.clip(err / (thr * 2), 0, 1) + return np.clip( + np.stack([2 - x * 2, x * 2, + np.zeros_like(x), + np.ones_like(x) * alpha], -1), 0, 1) diff --git a/modelscope/models/cv/image_matching_fast/__init__.py b/modelscope/models/cv/image_matching_fast/__init__.py new file mode 100644 index 000000000..ced7bc449 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/__init__.py @@ -0,0 +1,24 @@ +# The implementation is made publicly available under the +# Apache 2.0 license at https://github.com/cvg/LightGlue + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .lightglue_model import LightGlueImageMatching + +else: + _import_structure = { + 'lightglue_model': ['LightGlueImageMatching'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_matching_fast/config/__init__.py b/modelscope/models/cv/image_matching_fast/config/__init__.py new file mode 100644 index 000000000..84c52f690 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/config/__init__.py @@ -0,0 +1 @@ +from .default import lightglue_default_conf diff --git a/modelscope/models/cv/image_matching_fast/config/default.py b/modelscope/models/cv/image_matching_fast/config/default.py new file mode 100644 index 000000000..0100b96c9 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/config/default.py @@ -0,0 +1,15 @@ +lightglue_default_conf = { + 'features': 'superpoint', # superpoint disk aliked sift + 'name': 'lightglue', # just for interfacing + 'input_dim': 256, # input descriptor dimension (autoselected from weights) + 'descriptor_dim': 256, + 'add_scale_ori': False, + 'n_layers': 9, + 'num_heads': 4, + 'flash': True, # enable FlashAttention if available. 
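# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# lightglue_default_conf (config/default.py below) is a plain dict of
# defaults; a caller would typically copy it and override individual keys
# rather than mutating it in place. A minimal, hypothetical usage sketch,
# assuming the modelscope package is importable:
from modelscope.models.cv.image_matching_fast.config import lightglue_default_conf

conf = {**lightglue_default_conf, 'features': 'disk', 'filter_threshold': 0.2}
assert conf['n_layers'] == 9 and conf['features'] == 'disk'
assert lightglue_default_conf['filter_threshold'] == 0.1  # defaults untouched
# ------------------------------------------------------------------------------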
+ 'mp': False, # enable mixed precision + 'depth_confidence': 0.95, # early stopping, disable with -1 + 'width_confidence': 0.99, # point pruning, disable with -1 + 'filter_threshold': 0.1, # match threshold + 'weights': None, +} diff --git a/modelscope/models/cv/image_matching_fast/lightglue/__init__.py b/modelscope/models/cv/image_matching_fast/lightglue/__init__.py new file mode 100644 index 000000000..42719c9d5 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/__init__.py @@ -0,0 +1,6 @@ +from .aliked import ALIKED # noqa +from .disk import DISK # noqa +from .lightglue import LightGlue # noqa +from .sift import SIFT # noqa +from .superpoint import SuperPoint # noqa +from .utils import match_pair # noqa diff --git a/modelscope/models/cv/image_matching_fast/lightglue/aliked.py b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py new file mode 100644 index 000000000..71ff4f95e --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py @@ -0,0 +1,762 @@ +# BSD 3-Clause License + +# Copyright (c) 2022, Zhao Xiaoming +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Authors: +# Xiaoming Zhao, Xingming Wu, Weihai Chen, Peter C.Y. 
Chen, Qingsong Xu, and Zhengguo Li +# Code from https://github.com/Shiaoming/ALIKED + +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +import torchvision +from kornia.color import grayscale_to_rgb +from torch import nn +from torch.nn.modules.utils import _pair +from torchvision.models import resnet + +from .utils import Extractor + + +def get_patches(tensor: torch.Tensor, required_corners: torch.Tensor, + ps: int) -> torch.Tensor: + c, h, w = tensor.shape + corner = (required_corners - ps / 2 + 1).long() + corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps) + corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps) + offset = torch.arange(0, ps) + + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} + x, y = torch.meshgrid(offset, offset, **kw) + patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2) + patches = patches.to(corner) + corner[None, None] + pts = patches.reshape(-1, 2) + sampled = tensor.permute(1, 2, 0)[tuple(pts.T)[::-1]] + sampled = sampled.reshape(ps, ps, -1, c) + assert sampled.shape[:3] == patches.shape[:3] + return sampled.permute(2, 3, 0, 1) + + +def simple_nms(scores: torch.Tensor, nms_radius: int): + """Fast Non-maximum suppression to remove nearby points""" + + zeros = torch.zeros_like(scores) + max_mask = scores == torch.nn.functional.max_pool2d( + scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) + + for _ in range(2): + supp_mask = ( + torch.nn.functional.max_pool2d( + max_mask.float(), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ) > 0) + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == torch.nn.functional.max_pool2d( + supp_scores, + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +class DKD(nn.Module): + + def __init__( + self, + radius: int = 2, + top_k: int = 0, + scores_th: float = 0.2, + n_limit: int = 20000, + ): + """ + Args: + radius: soft detection radius, kernel size is (2 * radius + 1) + top_k: top_k > 0: return top k keypoints + scores_th: top_k <= 0 threshold mode: + scores_th > 0: return keypoints with scores>scores_th + else: return keypoints with scores > scores.mean() + n_limit: max number of keypoint in threshold mode + """ + super().__init__() + self.radius = radius + self.top_k = top_k + self.scores_th = scores_th + self.n_limit = n_limit + self.kernel_size = 2 * self.radius + 1 + self.temperature = 0.1 # tuned temperature + self.unfold = nn.Unfold( + kernel_size=self.kernel_size, padding=self.radius) + # local xy grid + x = torch.linspace(-self.radius, self.radius, self.kernel_size) + # (kernel_size*kernel_size) x 2 : (w,h) + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} + self.hw_grid = ( + torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, + [1, 0]]) + + def forward( + self, + scores_map: torch.Tensor, + sub_pixel: bool = True, + image_size: Optional[torch.Tensor] = None, + ): + """ + :param scores_map: Bx1xHxW + :param descriptor_map: BxCxHxW + :param sub_pixel: whether to use sub-pixel keypoint detection + :return: kpts: list[Nx2,...]; kptscores: list[N,....] 
normalised position: -1~1 + """ + b, c, h, w = scores_map.shape + scores_nograd = scores_map.detach() + nms_scores = simple_nms(scores_nograd, self.radius) + + # remove border + nms_scores[:, :, :self.radius, :] = 0 + nms_scores[:, :, :, :self.radius] = 0 + if image_size is not None: + for i in range(scores_map.shape[0]): + w, h = image_size[i].long() + nms_scores[i, :, h.item() - self.radius:, :] = 0 + nms_scores[i, :, :, w.item() - self.radius:] = 0 + else: + nms_scores[:, :, -self.radius:, :] = 0 + nms_scores[:, :, :, -self.radius:] = 0 + + # detect keypoints without grad + if self.top_k > 0: + topk = torch.topk(nms_scores.view(b, -1), self.top_k) + indices_keypoints = [topk.indices[i] + for i in range(b)] # B x top_k + else: + if self.scores_th > 0: + masks = nms_scores > self.scores_th + if masks.sum() == 0: + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + else: + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + masks = masks.reshape(b, -1) + + indices_keypoints = [] # list, B x (any size) + scores_view = scores_nograd.reshape(b, -1) + for mask, scores in zip(masks, scores_view): + indices = mask.nonzero()[:, 0] + if len(indices) > self.n_limit: + kpts_sc = scores[indices] + sort_idx = kpts_sc.sort(descending=True)[1] + sel_idx = sort_idx[:self.n_limit] + indices = indices[sel_idx] + indices_keypoints.append(indices) + + wh = torch.tensor([w - 1, h - 1], device=scores_nograd.device) + + keypoints = [] + scoredispersitys = [] + kptscores = [] + if sub_pixel: + # detect soft keypoints with grad backpropagation + patches = self.unfold(scores_map) # B x (kernel**2) x (H*W) + self.hw_grid = self.hw_grid.to(scores_map) # to device + for b_idx in range(b): + patch = patches[b_idx].t() # (H*W) x (kernel**2) + indices_kpt = indices_keypoints[ + b_idx] # one dimension vector, say its size is M + patch_scores = patch[indices_kpt] # M x (kernel**2) + keypoints_xy_nms = torch.stack( + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], + dim=1, + ) # Mx2 + + # max is detached to prevent undesired backprop loops in the graph + max_v = patch_scores.max(dim=1).values.detach()[:, None] + x_exp = ( + (patch_scores - max_v) + / self.temperature).exp() # M * (kernel**2), in [0, 1] + + # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} } + xy_residual = (x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] + ) # Soft-argmax, Mx2 + + hw_grid_dist2 = ( + torch.norm( + (self.hw_grid[None, :, :] - xy_residual[:, None, :]) + / self.radius, + dim=-1, + )**2) + scoredispersity = (x_exp * hw_grid_dist2).sum( + dim=1) / x_exp.sum(dim=1) + + # compute result keypoints + keypoints_xy = keypoints_xy_nms + xy_residual + keypoints_xy = keypoints_xy / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + + kptscore = torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode='bilinear', + align_corners=True, + )[0, 0, 0, :] # CxN + + keypoints.append(keypoints_xy) + scoredispersitys.append(scoredispersity) + kptscores.append(kptscore) + else: + for b_idx in range(b): + indices_kpt = indices_keypoints[ + b_idx] # one dimension vector, say its size is M + # To avoid warning: UserWarning: __floordiv__ is deprecated + keypoints_xy_nms = torch.stack( + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], + dim=1, + ) # Mx2 + keypoints_xy = keypoints_xy_nms / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + kptscore = 
torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode='bilinear', + align_corners=True, + )[0, 0, 0, :] # CxN + keypoints.append(keypoints_xy) + scoredispersitys.append( + kptscore) # for jit.script compatability + kptscores.append(kptscore) + + return keypoints, scoredispersitys, kptscores + + +class InputPadder(object): + """Pads images such that dimensions are divisible by 8""" + + def __init__(self, h: int, w: int, divis_by: int = 8): + self.ht = h + self.wd = w + pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by + pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by + self._pad = [ + pad_wd // 2, + pad_wd - pad_wd // 2, + pad_ht // 2, + pad_ht - pad_ht // 2, + ] + + def pad(self, x: torch.Tensor): + assert x.ndim == 4 + return F.pad(x, self._pad, mode='replicate') + + def unpad(self, x: torch.Tensor): + assert x.ndim == 4 + ht = x.shape[-2] + wd = x.shape[-1] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +class DeformableConv2d(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + mask=False, + ): + super(DeformableConv2d, self).__init__() + + self.padding = padding + self.mask = mask + + self.channel_num = (3 * kernel_size * kernel_size if mask else 2 + * kernel_size * kernel_size) + self.offset_conv = nn.Conv2d( + in_channels, + self.channel_num, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=True, + ) + + self.regular_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=bias, + ) + + def forward(self, x): + h, w = x.shape[2:] + max_offset = max(h, w) / 4.0 + + out = self.offset_conv(x) + if self.mask: + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + else: + offset = out + mask = None + offset = offset.clamp(-max_offset, max_offset) + x = torchvision.ops.deform_conv2d( + input=x, + offset=offset, + weight=self.regular_conv.weight, + bias=self.regular_conv.bias, + padding=self.padding, + mask=mask, + ) + return x + + +def get_conv( + inplanes, + planes, + kernel_size=3, + stride=1, + padding=1, + bias=False, + conv_type='conv', + mask=False, +): + if conv_type == 'conv': + conv = nn.Conv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + elif conv_type == 'dcn': + conv = DeformableConv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=_pair(padding), + bias=bias, + mask=mask, + ) + else: + raise TypeError + return conv + + +class ConvBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = 'conv', + mask: bool = False, + ): + super().__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.conv1 = get_conv( + in_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) + self.bn1 = norm_layer(out_channels) + self.conv2 = get_conv( + out_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) + self.bn2 = norm_layer(out_channels) + + def forward(self, x): + x = self.gate(self.bn1(self.conv1(x))) # B x 
in_channels x H x W + x = self.gate(self.bn2(self.conv2(x))) # B x out_channels x H x W + return x + + +# modified based on torchvision\models\resnet.py#27->BasicBlock +class ResBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = 'conv', + mask: bool = False, + ) -> None: + super(ResBlock, self).__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + 'ResBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError('Dilation > 1 not supported in ResBlock') + # Both self.conv1 and self.downsample layers + # downsample the input when stride != 1 + self.conv1 = get_conv( + inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask) + self.bn1 = norm_layer(planes) + self.conv2 = get_conv( + planes, planes, kernel_size=3, conv_type=conv_type, mask=mask) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.gate(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.gate(out) + + return out + + +class SDDH(nn.Module): + + def __init__( + self, + dims: int, + kernel_size: int = 3, + n_pos: int = 8, + gate=nn.ReLU(), + conv2D=False, + mask=False, + ): + super(SDDH, self).__init__() + self.kernel_size = kernel_size + self.n_pos = n_pos + self.conv2D = conv2D + self.mask = mask + + self.get_patches_func = get_patches + + # estimate offsets + self.channel_num = 3 * n_pos if mask else 2 * n_pos + self.offset_conv = nn.Sequential( + nn.Conv2d( + dims, + self.channel_num, + kernel_size=kernel_size, + stride=1, + padding=0, + bias=True, + ), + gate, + nn.Conv2d( + self.channel_num, + self.channel_num, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), + ) + + # sampled feature conv + self.sf_conv = nn.Conv2d( + dims, dims, kernel_size=1, stride=1, padding=0, bias=False) + + # convM + if not conv2D: + # deformable desc weights + agg_weights = torch.nn.Parameter(torch.rand(n_pos, dims, dims)) + self.register_parameter('agg_weights', agg_weights) + else: + self.convM = nn.Conv2d( + dims * n_pos, + dims, + kernel_size=1, + stride=1, + padding=0, + bias=False) + + def forward(self, x, keypoints): + # x: [B,C,H,W] + # keypoints: list, [[N_kpts,2], ...] 
(w,h) + b, c, h, w = x.shape + wh = torch.tensor([[w - 1, h - 1]], device=x.device) + max_offset = max(h, w) / 4.0 + + offsets = [] + descriptors = [] + # get offsets for each keypoint + for ib in range(b): + xi, kptsi = x[ib], keypoints[ib] + kptsi_wh = (kptsi / 2 + 0.5) * wh + N_kpts = len(kptsi) + + if self.kernel_size > 1: + patch = self.get_patches_func( + xi, kptsi_wh.long(), self.kernel_size) # [N_kpts, C, K, K] + else: + kptsi_wh_long = kptsi_wh.long() + patch = ( + xi[:, kptsi_wh_long[:, 1], + kptsi_wh_long[:, + 0]].permute(1, + 0).reshape(N_kpts, c, 1, 1)) + + offset = self.offset_conv(patch).clamp( + -max_offset, max_offset) # [N_kpts, 2*n_pos, 1, 1] + if self.mask: + offset = (offset[:, :, 0, 0].view(N_kpts, 3, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 3] + offset = offset[:, :, :-1] # [N_kpts, n_pos, 2] + mask_weight = torch.sigmoid(offset[:, :, + -1]) # [N_kpts, n_pos] + else: + offset = (offset[:, :, 0, 0].view(N_kpts, 2, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 2] + offsets.append(offset) # for visualization + + # get sample positions + pos = kptsi_wh.unsqueeze(1) + offset # [N_kpts, n_pos, 2] + pos = 2.0 * pos / wh[None] - 1 + pos = pos.reshape(1, N_kpts * self.n_pos, 1, 2) + + # sample features + features = F.grid_sample( + xi.unsqueeze(0), pos, mode='bilinear', + align_corners=True) # [1,C,(N_kpts*n_pos),1] + features = features.reshape(c, N_kpts, self.n_pos, + 1).permute(1, 0, 2, + 3) # [N_kpts, C, n_pos, 1] + if self.mask: + features = torch.einsum('ncpo,np->ncpo', features, mask_weight) + + features = torch.selu_(self.sf_conv(features)).squeeze( + -1) # [N_kpts, C, n_pos] + # convM + if not self.conv2D: + descs = torch.einsum('ncp,pcd->nd', features, + self.agg_weights) # [N_kpts, C] + else: + features = features.reshape( + N_kpts, -1)[:, :, None, None] # [N_kpts, C*n_pos, 1, 1] + descs = self.convM(features).squeeze() # [N_kpts, C] + + # normalize + descs = F.normalize(descs, p=2.0, dim=1) + descriptors.append(descs) + + return descriptors, offsets + + +class ALIKED(Extractor): + default_conf = { + 'model_name': 'aliked-n16', + 'max_num_keypoints': -1, + 'detection_threshold': 0.2, + 'nms_radius': 2, + } + + checkpoint_url = 'https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth' + + n_limit_max = 20000 + + # c1, c2, c3, c4, dim, K, M + cfgs = { + 'aliked-t16': [8, 16, 32, 64, 64, 3, 16], + 'aliked-n16': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n16rot': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n32': [16, 32, 64, 128, 128, 3, 32], + } + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, **conf): + super().__init__(**conf) # Update with default configuration. 
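# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# DKD and SDDH above keep keypoints in normalized [-1, 1] coordinates, and
# ALIKED.forward() converts them back to pixels with wh * (kpts + 1) / 2.
# A tiny round-trip sketch of that convention (image size is hypothetical):
import torch

w, h = 640, 480
wh = torch.tensor([w - 1, h - 1], dtype=torch.float32)
kpts_norm = torch.tensor([[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]])
kpts_px = wh * (kpts_norm + 1) / 2.0    # (0, 0), (319.5, 239.5), (639, 479)
kpts_back = kpts_px / wh * 2 - 1        # inverse mapping used inside DKD/SDDH
assert torch.allclose(kpts_back, kpts_norm)
# ------------------------------------------------------------------------------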
+ conf = self.conf + c1, c2, c3, c4, dim, K, M = self.cfgs[conf.model_name] + conv_types = ['conv', 'conv', 'dcn', 'dcn'] + conv2D = False + mask = False + + # build model + self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2) + self.pool4 = nn.AvgPool2d(kernel_size=4, stride=4) + self.norm = nn.BatchNorm2d + self.gate = nn.SELU(inplace=True) + self.block1 = ConvBlock( + 3, c1, self.gate, self.norm, conv_type=conv_types[0]) + self.block2 = self.get_resblock(c1, c2, conv_types[1], mask) + self.block3 = self.get_resblock(c2, c3, conv_types[2], mask) + self.block4 = self.get_resblock(c3, c4, conv_types[3], mask) + + self.conv1 = resnet.conv1x1(c1, dim // 4) + self.conv2 = resnet.conv1x1(c2, dim // 4) + self.conv3 = resnet.conv1x1(c3, dim // 4) + self.conv4 = resnet.conv1x1(dim, dim // 4) + self.upsample2 = nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=True) + self.upsample4 = nn.Upsample( + scale_factor=4, mode='bilinear', align_corners=True) + self.upsample8 = nn.Upsample( + scale_factor=8, mode='bilinear', align_corners=True) + self.upsample32 = nn.Upsample( + scale_factor=32, mode='bilinear', align_corners=True) + self.score_head = nn.Sequential( + resnet.conv1x1(dim, 8), + self.gate, + resnet.conv3x3(8, 4), + self.gate, + resnet.conv3x3(4, 4), + self.gate, + resnet.conv3x3(4, 1), + ) + self.desc_head = SDDH( + dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask) + self.dkd = DKD( + radius=conf.nms_radius, + top_k=-1 + if conf.detection_threshold > 0 else conf.max_num_keypoints, + scores_th=conf.detection_threshold, + n_limit=conf.max_num_keypoints + if conf.max_num_keypoints > 0 else self.n_limit_max, + ) + + state_dict = torch.hub.load_state_dict_from_url( + self.checkpoint_url.format(conf.model_name), map_location='cpu') + self.load_state_dict(state_dict, strict=True) + + def get_resblock(self, c_in, c_out, conv_type, mask): + return ResBlock( + c_in, + c_out, + 1, + nn.Conv2d(c_in, c_out, 1), + gate=self.gate, + norm_layer=self.norm, + conv_type=conv_type, + mask=mask, + ) + + def extract_dense_map(self, image): + # Pads images such that dimensions are divisible by + div_by = 2**5 + padder = InputPadder(image.shape[-2], image.shape[-1], div_by) + image = padder.pad(image) + + # ================================== feature encoder + x1 = self.block1(image) # B x c1 x H x W + x2 = self.pool2(x1) + x2 = self.block2(x2) # B x c2 x H/2 x W/2 + x3 = self.pool4(x2) + x3 = self.block3(x3) # B x c3 x H/8 x W/8 + x4 = self.pool4(x3) + x4 = self.block4(x4) # B x dim x H/32 x W/32 + # ================================== feature aggregation + x1 = self.gate(self.conv1(x1)) # B x dim//4 x H x W + x2 = self.gate(self.conv2(x2)) # B x dim//4 x H//2 x W//2 + x3 = self.gate(self.conv3(x3)) # B x dim//4 x H//8 x W//8 + x4 = self.gate(self.conv4(x4)) # B x dim//4 x H//32 x W//32 + x2_up = self.upsample2(x2) # B x dim//4 x H x W + x3_up = self.upsample8(x3) # B x dim//4 x H x W + x4_up = self.upsample32(x4) # B x dim//4 x H x W + x1234 = torch.cat([x1, x2_up, x3_up, x4_up], dim=1) + # ================================== score head + score_map = torch.sigmoid(self.score_head(x1234)) + feature_map = torch.nn.functional.normalize(x1234, p=2, dim=1) + + # Unpads images + feature_map = padder.unpad(feature_map) + score_map = padder.unpad(score_map) + + return feature_map, score_map + + def forward(self, data: dict) -> dict: + image = data['image'] + if image.shape[1] == 1: + image = grayscale_to_rgb(image) + feature_map, score_map = self.extract_dense_map(image) + keypoints, kptscores, 
scoredispersitys = self.dkd( + score_map, image_size=data.get('image_size')) + descriptors, offsets = self.desc_head(feature_map, keypoints) + + _, _, h, w = image.shape + wh = torch.tensor([w - 1, h - 1], device=image.device) + # no padding required + # we can set detection_threshold=-1 and conf.max_num_keypoints > 0 + return { + 'keypoints': wh * (torch.stack(keypoints) + 1) / 2.0, # B x N x 2 + 'descriptors': torch.stack(descriptors), # B x N x D + 'keypoint_scores': torch.stack(kptscores), # B x N + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/disk.py b/modelscope/models/cv/image_matching_fast/lightglue/disk.py new file mode 100644 index 000000000..08d521c44 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/disk.py @@ -0,0 +1,55 @@ +import kornia +import torch + +from .utils import Extractor + + +class DISK(Extractor): + default_conf = { + 'weights': 'depth', + 'max_num_keypoints': None, + 'desc_dim': 128, + 'nms_window_size': 5, + 'detection_threshold': 0.0, + 'pad_if_not_divisible': True, + } + + preprocess_conf = { + 'resize': 1024, + 'grayscale': False, + } + + required_data_keys = ['image'] + + def __init__(self, **conf) -> None: + super().__init__(**conf) # Update with default configuration. + self.model = kornia.feature.DISK.from_pretrained(self.conf.weights) + + def forward(self, data: dict) -> dict: + """Compute keypoints, scores, descriptors for image""" + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + image = data['image'] + if image.shape[1] == 1: + image = kornia.color.grayscale_to_rgb(image) + features = self.model( + image, + n=self.conf.max_num_keypoints, + window_size=self.conf.nms_window_size, + score_threshold=self.conf.detection_threshold, + pad_if_not_divisible=self.conf.pad_if_not_divisible, + ) + keypoints = [f.keypoints for f in features] + scores = [f.detection_scores for f in features] + descriptors = [f.descriptors for f in features] + del features + + keypoints = torch.stack(keypoints, 0) + scores = torch.stack(scores, 0) + descriptors = torch.stack(descriptors, 0) + + return { + 'keypoints': keypoints.to(image).contiguous(), + 'keypoint_scores': scores.to(image).contiguous(), + 'descriptors': descriptors.to(image).contiguous(), + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py new file mode 100644 index 000000000..16888b556 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py @@ -0,0 +1,641 @@ +import os.path as osp +import warnings +from pathlib import Path +from types import SimpleNamespace +from typing import Callable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +try: + from flash_attn.modules.mha import FlashCrossAttention +except ModuleNotFoundError: + FlashCrossAttention = None + +if FlashCrossAttention or hasattr(F, 'scaled_dot_product_attention'): + FLASH_AVAILABLE = True +else: + FLASH_AVAILABLE = False + +torch.backends.cudnn.deterministic = True + + +@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) +def normalize_keypoints(kpts: torch.Tensor, + size: Optional[torch.Tensor] = None) -> torch.Tensor: + if size is None: + size = 1 + kpts.max(-2).values - kpts.min(-2).values + elif not isinstance(size, torch.Tensor): + size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype) + size = size.to(kpts) + shift = size / 2 + scale = size.max(-1).values / 2 + kpts = 
(kpts - shift[..., None, :]) / scale[..., None, None] + return kpts + + +def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor]: + if length <= x.shape[-2]: + return x, torch.ones_like(x[..., :1], dtype=torch.bool) + pad = torch.ones( + *x.shape[:-2], + length - x.shape[-2], + x.shape[-1], + device=x.device, + dtype=x.dtype) + y = torch.cat([x, pad], dim=-2) + mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device) + mask[..., :x.shape[-2], :] = True + return y, mask + + +def rotate_half(x: torch.Tensor) -> torch.Tensor: + x = x.unflatten(-1, (-1, 2)) + x1, x2 = x.unbind(dim=-1) + return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2) + + +def apply_cached_rotary_emb(freqs: torch.Tensor, + t: torch.Tensor) -> torch.Tensor: + return (t * freqs[0]) + (rotate_half(t) * freqs[1]) + + +class LearnableFourierPositionalEncoding(nn.Module): + + def __init__(self, + M: int, + dim: int, + F_dim: int = None, + gamma: float = 1.0) -> None: + super().__init__() + F_dim = F_dim if F_dim is not None else dim + self.gamma = gamma + self.Wr = nn.Linear(M, F_dim // 2, bias=False) + nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """encode position vector""" + projected = self.Wr(x) + cosines, sines = torch.cos(projected), torch.sin(projected) + emb = torch.stack([cosines, sines], 0).unsqueeze(-3) + return emb.repeat_interleave(2, dim=-1) + + +class TokenConfidence(nn.Module): + + def __init__(self, dim: int) -> None: + super().__init__() + self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid()) + + def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): + """get confidence tokens""" + return ( + self.token(desc0.detach()).squeeze(-1), + self.token(desc1.detach()).squeeze(-1), + ) + + +class Attention(nn.Module): + + def __init__(self, allow_flash: bool) -> None: + super().__init__() + if allow_flash and not FLASH_AVAILABLE: + warnings.warn( + 'FlashAttention is not available. 
For optimal speed, ' + 'consider installing torch >= 2.0 or flash-attn.', + stacklevel=2, + ) + self.enable_flash = allow_flash and FLASH_AVAILABLE + self.has_sdp = hasattr(F, 'scaled_dot_product_attention') + if allow_flash and FlashCrossAttention: + self.flash_ = FlashCrossAttention() + if self.has_sdp: + torch.backends.cuda.enable_flash_sdp(allow_flash) + + def forward(self, + q, + k, + v, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + if self.enable_flash and q.device.type == 'cuda': + # use torch 2.0 scaled_dot_product_attention with flash + if self.has_sdp: + args = [x.half().contiguous() for x in [q, k, v]] + v = F.scaled_dot_product_attention( + *args, attn_mask=mask).to(q.dtype) + return v if mask is None else v.nan_to_num() + else: + assert mask is None + q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]] + m = self.flash_(q.half(), torch.stack([k, v], 2).half()) + return m.transpose(-2, -3).to(q.dtype).clone() + elif self.has_sdp: + args = [x.contiguous() for x in [q, k, v]] + v = F.scaled_dot_product_attention(*args, attn_mask=mask) + return v if mask is None else v.nan_to_num() + else: + s = q.shape[-1]**-0.5 + sim = torch.einsum('...id,...jd->...ij', q, k) * s + if mask is not None: + sim.masked_fill(~mask, -float('inf')) + attn = F.softmax(sim, -1) + return torch.einsum('...ij,...jd->...id', attn, v) + + +class SelfBlock(nn.Module): + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + assert self.embed_dim % num_heads == 0 + self.head_dim = self.embed_dim // num_heads + self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias) + self.inner_attn = Attention(flash) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.ffn = nn.Sequential( + nn.Linear(2 * embed_dim, 2 * embed_dim), + nn.LayerNorm(2 * embed_dim, elementwise_affine=True), + nn.GELU(), + nn.Linear(2 * embed_dim, embed_dim), + ) + + def forward( + self, + x: torch.Tensor, + encoding: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qkv = self.Wqkv(x) + qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2) + q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2] + q = apply_cached_rotary_emb(encoding, q) + k = apply_cached_rotary_emb(encoding, k) + context = self.inner_attn(q, k, v, mask=mask) + message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2)) + return x + self.ffn(torch.cat([x, message], -1)) + + +class CrossBlock(nn.Module): + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: + super().__init__() + self.heads = num_heads + dim_head = embed_dim // num_heads + self.scale = dim_head**-0.5 + inner_dim = dim_head * num_heads + self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias) + self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias) + self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias) + self.ffn = nn.Sequential( + nn.Linear(2 * embed_dim, 2 * embed_dim), + nn.LayerNorm(2 * embed_dim, elementwise_affine=True), + nn.GELU(), + nn.Linear(2 * embed_dim, embed_dim), + ) + if flash and FLASH_AVAILABLE: + self.flash = Attention(True) + else: + self.flash = None + + def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor): + return func(x0), func(x1) + + def forward(self, + x0: torch.Tensor, + x1: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> List[torch.Tensor]: + qk0, qk1 = self.map_(self.to_qk, x0, x1) + v0, 
v1 = self.map_(self.to_v, x0, x1) + qk0, qk1, v0, v1 = map( + lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2), + (qk0, qk1, v0, v1), + ) + if self.flash is not None and qk0.device.type == 'cuda': + m0 = self.flash(qk0, qk1, v1, mask) + m1 = self.flash( + qk1, qk0, v0, + mask.transpose(-1, -2) if mask is not None else None) + else: + qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5 + sim = torch.einsum('bhid, bhjd -> bhij', qk0, qk1) + if mask is not None: + sim = sim.masked_fill(~mask, -float('inf')) + attn01 = F.softmax(sim, dim=-1) + attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1) + m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1) + m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), + v0) + if mask is not None: + m0, m1 = m0.nan_to_num(), m1.nan_to_num() + m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), + m0, m1) + m0, m1 = self.map_(self.to_out, m0, m1) + x0 = x0 + self.ffn(torch.cat([x0, m0], -1)) + x1 = x1 + self.ffn(torch.cat([x1, m1], -1)) + return x0, x1 + + +class TransformerLayer(nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__() + self.self_attn = SelfBlock(*args, **kwargs) + self.cross_attn = CrossBlock(*args, **kwargs) + + def forward( + self, + desc0, + desc1, + encoding0, + encoding1, + mask0: Optional[torch.Tensor] = None, + mask1: Optional[torch.Tensor] = None, + ): + if mask0 is not None and mask1 is not None: + return self.masked_forward(desc0, desc1, encoding0, encoding1, + mask0, mask1) + else: + desc0 = self.self_attn(desc0, encoding0) + desc1 = self.self_attn(desc1, encoding1) + return self.cross_attn(desc0, desc1) + + # This part is compiled and allows padding inputs + def masked_forward(self, desc0, desc1, encoding0, encoding1, mask0, mask1): + mask = mask0 & mask1.transpose(-1, -2) + mask0 = mask0 & mask0.transpose(-1, -2) + mask1 = mask1 & mask1.transpose(-1, -2) + desc0 = self.self_attn(desc0, encoding0, mask0) + desc1 = self.self_attn(desc1, encoding1, mask1) + return self.cross_attn(desc0, desc1, mask) + + +def sigmoid_log_double_softmax(sim: torch.Tensor, z0: torch.Tensor, + z1: torch.Tensor) -> torch.Tensor: + """create the log assignment matrix from logits and similarity""" + b, m, n = sim.shape + certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2) + scores0 = F.log_softmax(sim, 2) + scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), + 2).transpose(-1, -2) + scores = sim.new_full((b, m + 1, n + 1), 0) + scores[:, :m, :n] = scores0 + scores1 + certainties + scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1)) + scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1)) + return scores + + +class MatchAssignment(nn.Module): + + def __init__(self, dim: int) -> None: + super().__init__() + self.dim = dim + self.matchability = nn.Linear(dim, 1, bias=True) + self.final_proj = nn.Linear(dim, dim, bias=True) + + def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): + """build assignment matrix from descriptors""" + mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1) + _, _, d = mdesc0.shape + mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25 + sim = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1) + z0 = self.matchability(desc0) + z1 = self.matchability(desc1) + scores = sigmoid_log_double_softmax(sim, z0, z1) + return scores, sim + + def get_matchability(self, desc: torch.Tensor): + return torch.sigmoid(self.matchability(desc)).squeeze(-1) + + +def filter_matches(scores: torch.Tensor, th: float): + """obtain matches from a log assignment 
matrix [Bx M+1 x N+1]""" + max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1) + m0, m1 = max0.indices, max1.indices + indices0 = torch.arange(m0.shape[1], device=m0.device)[None] + indices1 = torch.arange(m1.shape[1], device=m1.device)[None] + mutual0 = indices0 == m1.gather(1, m0) + mutual1 = indices1 == m0.gather(1, m1) + max0_exp = max0.values.exp() + zero = max0_exp.new_tensor(0) + mscores0 = torch.where(mutual0, max0_exp, zero) + mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero) + valid0 = mutual0 & (mscores0 > th) + valid1 = mutual1 & valid0.gather(1, m1) + m0 = torch.where(valid0, m0, -1) + m1 = torch.where(valid1, m1, -1) + return m0, m1, mscores0, mscores1 + + +class LightGlue(nn.Module): + + # Point pruning involves an overhead (gather). + # Therefore, we only activate it if there are enough keypoints. + pruning_keypoint_thresholds = { + 'cpu': -1, + 'mps': -1, + 'cuda': 1024, + 'flash': 1536, + } + + required_data_keys = ['image0', 'image1'] + + version = 'v0.1_arxiv' + weight_path = '{}_lightglue.pth' + + features = { + 'superpoint': { + 'weights': 'superpoint_lightglue', + 'input_dim': 256, + }, + 'disk': { + 'weights': 'disk_lightglue', + 'input_dim': 128, + }, + 'aliked': { + 'weights': 'aliked_lightglue', + 'input_dim': 128, + }, + 'sift': { + 'weights': 'sift_lightglue', + 'input_dim': 128, + 'add_scale_ori': True, + }, + } + + def __init__(self, model_dir, default_conf, **conf) -> None: + super().__init__() + self.conf = conf = SimpleNamespace(**{**default_conf, **conf}) + if conf.features is not None: + if conf.features not in self.features: + raise ValueError( + f'Unsupported features: {conf.features} not in ' + f"{{{','.join(self.features)}}}") + for k, v in self.features[conf.features].items(): + setattr(conf, k, v) + + if conf.input_dim != conf.descriptor_dim: + self.input_proj = nn.Linear( + conf.input_dim, conf.descriptor_dim, bias=True) + else: + self.input_proj = nn.Identity() + + head_dim = conf.descriptor_dim // conf.num_heads + self.posenc = LearnableFourierPositionalEncoding( + 2 + 2 * self.conf.add_scale_ori, head_dim, head_dim) + + h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim + + self.transformers = nn.ModuleList( + [TransformerLayer(d, h, conf.flash) for _ in range(n)]) + + self.log_assignment = nn.ModuleList( + [MatchAssignment(d) for _ in range(n)]) + self.token_confidence = nn.ModuleList( + [TokenConfidence(d) for _ in range(n - 1)]) + self.register_buffer( + 'confidence_thresholds', + torch.Tensor([ + self.confidence_threshold(i) for i in range(self.conf.n_layers) + ]), + ) + + state_dict = None + if conf.features is not None: + state_dict = torch.load( + osp.join(model_dir, self.weight_path.format(conf.features)), + map_location='cpu') + self.load_state_dict(state_dict, strict=False) + elif conf.weights is not None: + path = Path(__file__).parent + path = path / 'weights/{}.pth'.format(self.conf.weights) + state_dict = torch.load(str(path), map_location='cpu') + + if state_dict: + # rename old state dict entries + for i in range(self.conf.n_layers): + pattern = f'self_attn.{i}', f'transformers.{i}.self_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } + pattern = f'cross_attn.{i}', f'transformers.{i}.cross_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } + self.load_state_dict(state_dict, strict=False) + + # static lengths LightGlue is compiled for (only used with torch.compile) + self.static_lengths = None + + def compile(self, + 
mode='reduce-overhead', + static_lengths=[256, 512, 768, 1024, 1280, 1536]): + if self.conf.width_confidence != -1: + warnings.warn( + 'Point pruning is partially disabled for compiled forward.', + stacklevel=2, + ) + + for i in range(self.conf.n_layers): + self.transformers[i].masked_forward = torch.compile( + self.transformers[i].masked_forward, mode=mode, fullgraph=True) + + self.static_lengths = static_lengths + + def forward(self, data: dict) -> dict: + """ + Match keypoints and descriptors between two images + + Input (dict): + image0: dict + keypoints: [B x M x 2] + descriptors: [B x M x D] + image: [B x C x H x W] or image_size: [B x 2] + image1: dict + keypoints: [B x N x 2] + descriptors: [B x N x D] + image: [B x C x H x W] or image_size: [B x 2] + Output (dict): + log_assignment: [B x M+1 x N+1] + matches0: [B x M] + matching_scores0: [B x M] + matches1: [B x N] + matching_scores1: [B x N] + matches: List[[Si x 2]], scores: List[[Si]] + """ + with torch.autocast(enabled=self.conf.mp, device_type='cuda'): + return self._forward(data) + + def _forward(self, data: dict) -> dict: + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + data0, data1 = data['image0'], data['image1'] + kpts0, kpts1 = data0['keypoints'], data1['keypoints'] + b, m, _ = kpts0.shape + b, n, _ = kpts1.shape + device = kpts0.device + size0, size1 = data0.get('image_size'), data1.get('image_size') + kpts0 = normalize_keypoints(kpts0, size0).clone() + kpts1 = normalize_keypoints(kpts1, size1).clone() + + if self.conf.add_scale_ori: + kpts0 = torch.cat( + [kpts0] + [data0[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) + kpts1 = torch.cat( + [kpts1] + [data1[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) + desc0 = data0['descriptors'].detach().contiguous() + desc1 = data1['descriptors'].detach().contiguous() + + assert desc0.shape[-1] == self.conf.input_dim + assert desc1.shape[-1] == self.conf.input_dim + + if torch.is_autocast_enabled(): + desc0 = desc0.half() + desc1 = desc1.half() + + mask0, mask1 = None, None + c = max(m, n) + do_compile = self.static_lengths and c <= max(self.static_lengths) + if do_compile: + kn = min([k for k in self.static_lengths if k >= c]) + desc0, mask0 = pad_to_length(desc0, kn) + desc1, mask1 = pad_to_length(desc1, kn) + kpts0, _ = pad_to_length(kpts0, kn) + kpts1, _ = pad_to_length(kpts1, kn) + desc0 = self.input_proj(desc0) + desc1 = self.input_proj(desc1) + # cache positional embeddings + encoding0 = self.posenc(kpts0) + encoding1 = self.posenc(kpts1) + + # GNN + final_proj + assignment + do_early_stop = self.conf.depth_confidence > 0 + do_point_pruning = self.conf.width_confidence > 0 and not do_compile + pruning_th = self.pruning_min_kpts(device) + if do_point_pruning: + ind0 = torch.arange(0, m, device=device)[None] + ind1 = torch.arange(0, n, device=device)[None] + # We store the index of the layer at which pruning is detected. 
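+ # Added clarification (hedged, not upstream text): prune0/prune1 start at 1 for
+ # every keypoint and are incremented each time a point survives a pruning step,
+ # so larger values mean the point was kept for more layers. Toy example (assumed
+ # values): with m = 4 and a pruning step that keeps indices [0, 2], ind0 becomes
+ # tensor([[0, 2]]) and prune0 becomes tensor([[2, 1, 2, 1]]).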
+ prune0 = torch.ones_like(ind0) + prune1 = torch.ones_like(ind1) + token0, token1 = None, None + for i in range(self.conf.n_layers): + desc0, desc1 = self.transformers[i]( + desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1) + if i == self.conf.n_layers - 1: + continue # no early stopping or adaptive width at last layer + + if do_early_stop: + token0, token1 = self.token_confidence[i](desc0, desc1) + if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], + i, m + n): + break + if do_point_pruning and desc0.shape[-2] > pruning_th: + scores0 = self.log_assignment[i].get_matchability(desc0) + prunemask0 = self.get_pruning_mask(token0, scores0, i) + keep0 = torch.where(prunemask0)[1] + ind0 = ind0.index_select(1, keep0) + desc0 = desc0.index_select(1, keep0) + encoding0 = encoding0.index_select(-2, keep0) + prune0[:, ind0] += 1 + if do_point_pruning and desc1.shape[-2] > pruning_th: + scores1 = self.log_assignment[i].get_matchability(desc1) + prunemask1 = self.get_pruning_mask(token1, scores1, i) + keep1 = torch.where(prunemask1)[1] + ind1 = ind1.index_select(1, keep1) + desc1 = desc1.index_select(1, keep1) + encoding1 = encoding1.index_select(-2, keep1) + prune1[:, ind1] += 1 + + desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] + scores, _ = self.log_assignment[i](desc0, desc1) + m0, m1, mscores0, mscores1 = filter_matches(scores, + self.conf.filter_threshold) + matches, mscores = [], [] + for k in range(b): + valid = m0[k] > -1 + m_indices_0 = torch.where(valid)[0] + m_indices_1 = m0[k][valid] + if do_point_pruning: + m_indices_0 = ind0[k, m_indices_0] + m_indices_1 = ind1[k, m_indices_1] + matches.append(torch.stack([m_indices_0, m_indices_1], -1)) + mscores.append(mscores0[k][valid]) + + # TODO: Remove when hloc switches to the compact format. + if do_point_pruning: + m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype) + m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype) + m0_[:, ind0] = torch.where(m0 == -1, -1, + ind1.gather(1, m0.clamp(min=0))) + m1_[:, ind1] = torch.where(m1 == -1, -1, + ind0.gather(1, m1.clamp(min=0))) + mscores0_ = torch.zeros((b, m), device=mscores0.device) + mscores1_ = torch.zeros((b, n), device=mscores1.device) + mscores0_[:, ind0] = mscores0 + mscores1_[:, ind1] = mscores1 + m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_ + else: + prune0 = torch.ones_like(mscores0) * self.conf.n_layers + prune1 = torch.ones_like(mscores1) * self.conf.n_layers + + pred = { + 'matches0': m0, + 'matches1': m1, + 'matching_scores0': mscores0, + 'matching_scores1': mscores1, + 'stop': i + 1, + 'matches': matches, + 'scores': mscores, + 'prune0': prune0, + 'prune1': prune1, + } + + return pred + + def confidence_threshold(self, layer_index: int) -> float: + """scaled confidence threshold""" + threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers) + return np.clip(threshold, 0, 1) + + def get_pruning_mask(self, confidences: torch.Tensor, scores: torch.Tensor, + layer_index: int) -> torch.Tensor: + """mask points which should be removed""" + keep = scores > (1 - self.conf.width_confidence) + if confidences is not None: # Low-confidence points are never pruned. 
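+ # Added gloss (not upstream): combined with the line below, a point is pruned
+ # only when the network is confident about it *and* judges it unmatchable;
+ # points whose confidence is still low are kept for later layers.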
+ keep |= confidences <= self.confidence_thresholds[layer_index] + return keep + + def check_if_stop( + self, + confidences0: torch.Tensor, + confidences1: torch.Tensor, + layer_index: int, + num_points: int, + ) -> torch.Tensor: + """evaluate stopping condition""" + confidences = torch.cat([confidences0, confidences1], -1) + threshold = self.confidence_thresholds[layer_index] + ratio_confident = 1.0 - ( + confidences < threshold).float().sum() / num_points # noqa E501 + return ratio_confident > self.conf.depth_confidence + + def pruning_min_kpts(self, device: torch.device): + if self.conf.flash and FLASH_AVAILABLE and device.type == 'cuda': + return self.pruning_keypoint_thresholds['flash'] + else: + return self.pruning_keypoint_thresholds[device.type] diff --git a/modelscope/models/cv/image_matching_fast/lightglue/sift.py b/modelscope/models/cv/image_matching_fast/lightglue/sift.py new file mode 100644 index 000000000..435d8f7f5 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/sift.py @@ -0,0 +1,221 @@ +import warnings + +import cv2 +import numpy as np +import torch +from kornia.color import rgb_to_grayscale +from packaging import version + +from .utils import Extractor + +try: + import pycolmap +except ImportError: + pycolmap = None + + +def filter_dog_point(points, + scales, + angles, + image_shape, + nms_radius, + scores=None): + h, w = image_shape + ij = np.round(points - 0.5).astype(int).T[::-1] + + # Remove duplicate points (identical coordinates). + # Pick highest scale or score + s = scales if scores is None else scores + buffer = np.zeros((h, w)) + np.maximum.at(buffer, tuple(ij), s) + keep = np.where(buffer[tuple(ij)] == s)[0] + + # Pick lowest angle (arbitrary). + ij = ij[:, keep] + buffer[:] = np.inf + o_abs = np.abs(angles[keep]) + np.minimum.at(buffer, tuple(ij), o_abs) + mask = buffer[tuple(ij)] == o_abs + ij = ij[:, mask] + keep = keep[mask] + + if nms_radius > 0: + # Apply NMS on the remaining points + buffer[:] = 0 + buffer[tuple(ij)] = s[keep] # scores or scale + + local_max = torch.nn.functional.max_pool2d( + torch.from_numpy(buffer).unsqueeze(0), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ).squeeze(0) + is_local_max = buffer == local_max.numpy() + keep = keep[is_local_max[tuple(ij)]] + return keep + + +def sift_to_rootsift(x: torch.Tensor, eps=1e-6) -> torch.Tensor: + x = torch.nn.functional.normalize(x, p=1, dim=-1, eps=eps) + x.clip_(min=eps).sqrt_() + return torch.nn.functional.normalize(x, p=2, dim=-1, eps=eps) + + +def run_opencv_sift(features: cv2.Feature2D, image: np.ndarray) -> np.ndarray: + """ + Detect keypoints using OpenCV Detector. + Optionally, perform description. + Args: + features: OpenCV based keypoints detector and descriptor + image: Grayscale image of uint8 data type + Returns: + keypoints: 1D array of detected cv2.KeyPoint + scores: 1D array of responses + descriptors: 1D array of descriptors + """ + detections, descriptors = features.detectAndCompute(image, None) + points = np.array([k.pt for k in detections], dtype=np.float32) + scores = np.array([k.response for k in detections], dtype=np.float32) + scales = np.array([k.size for k in detections], dtype=np.float32) + angles = np.deg2rad( + np.array([k.angle for k in detections], dtype=np.float32)) + return points, scores, scales, angles, descriptors + + +class SIFT(Extractor): + default_conf = { + 'rootsift': True, + 'nms_radius': 0, # None to disable filtering entirely. 
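+ # Added comment (hedged): filter_dog_point above first drops duplicate
+ # detections at the same pixel (keeping the highest score or scale), then runs
+ # a max-pool NMS with this radius over the surviving points.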
+ 'max_num_keypoints': 4096, + 'backend': + 'opencv', # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda} + 'detection_threshold': 0.0066667, # from COLMAP + 'edge_threshold': 10, + 'first_octave': -1, # only used by pycolmap, the default of COLMAP + 'num_octaves': 4, + } + + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, **conf): + super().__init__(**conf) # Update with default configuration. + backend = self.conf.backend + if backend.startswith('pycolmap'): + if pycolmap is None: + raise ImportError( + 'Cannot find module pycolmap: install it with pip' + 'or use backend=opencv.') + options = { + 'peak_threshold': self.conf.detection_threshold, + 'edge_threshold': self.conf.edge_threshold, + 'first_octave': self.conf.first_octave, + 'num_octaves': self.conf.num_octaves, + 'normalization': + pycolmap.Normalization.L2, # L1_ROOT is buggy. + } + device = ('auto' if backend == 'pycolmap' else backend.replace( + 'pycolmap_', '')) + if (backend == 'pycolmap_cpu' or not pycolmap.has_cuda + ) and pycolmap.__version__ < '0.5.0': # noqa E501 + warnings.warn( + 'The pycolmap CPU SIFT is buggy in version < 0.5.0, ' + 'consider upgrading pycolmap or use the CUDA version.', + stacklevel=1, + ) + else: + options['max_num_features'] = self.conf.max_num_keypoints + self.sift = pycolmap.Sift(options=options, device=device) + elif backend == 'opencv': + self.sift = cv2.SIFT_create( + contrastThreshold=self.conf.detection_threshold, + nfeatures=self.conf.max_num_keypoints, + edgeThreshold=self.conf.edge_threshold, + nOctaveLayers=self.conf.num_octaves, + ) + else: + backends = {'opencv', 'pycolmap', 'pycolmap_cpu', 'pycolmap_cuda'} + raise ValueError(f'Unknown backend: {backend} not in ' + f"{{{','.join(backends)}}}.") + + def extract_single_image(self, image: torch.Tensor): + image_np = image.cpu().numpy().squeeze(0) + + if self.conf.backend.startswith('pycolmap'): + if version.parse(pycolmap.__version__) >= version.parse('0.5.0'): + detections, descriptors = self.sift.extract(image_np) + scores = None # Scores are not exposed by COLMAP anymore. + else: + detections, scores, descriptors = self.sift.extract(image_np) + keypoints = detections[:, :2] # Keep only (x, y). + scales, angles = detections[:, -2:].T + if scores is not None and (self.conf.backend == 'pycolmap_cpu' + or not pycolmap.has_cuda): + # Set the scores as a combination of abs. response and scale. + scores = np.abs(scores) * scales + elif self.conf.backend == 'opencv': + # TODO: Check if opencv keypoints are already in corner convention + keypoints, scores, scales, angles, descriptors = run_opencv_sift( + self.sift, (image_np * 255.0).astype(np.uint8)) + pred = { + 'keypoints': keypoints, + 'scales': scales, + 'oris': angles, + 'descriptors': descriptors, + } + if scores is not None: + pred['keypoint_scores'] = scores + + # sometimes pycolmap returns points outside the image. 
We remove them + if self.conf.backend.startswith('pycolmap'): + is_inside = (pred['keypoints'] + 0.5 < np.array( + [image_np.shape[-2:][::-1]])).all(-1) + pred = {k: v[is_inside] for k, v in pred.items()} + + if self.conf.nms_radius is not None: + keep = filter_dog_point( + pred['keypoints'], + pred['scales'], + pred['oris'], + image_np.shape, + self.conf.nms_radius, + scores=pred.get('keypoint_scores'), + ) + pred = {k: v[keep] for k, v in pred.items()} + + pred = {k: torch.from_numpy(v) for k, v in pred.items()} + if scores is not None: + # Keep the k keypoints with highest score + num_points = self.conf.max_num_keypoints + if num_points is not None and len(pred['keypoints']) > num_points: + indices = torch.topk(pred['keypoint_scores'], + num_points).indices + pred = {k: v[indices] for k, v in pred.items()} + + return pred + + def forward(self, data: dict) -> dict: + image = data['image'] + if image.shape[1] == 3: + image = rgb_to_grayscale(image) + device = image.device + image = image.cpu() + pred = [] + for k in range(len(image)): + img = image[k] + if 'image_size' in data.keys(): + # avoid extracting points in padded areas + w, h = data['image_size'][k] + img = img[:, :h, :w] + p = self.extract_single_image(img) + pred.append(p) + pred = { + k: torch.stack([p[k] for p in pred], 0).to(device) + for k in pred[0] + } + if self.conf.rootsift: + pred['descriptors'] = sift_to_rootsift(pred['descriptors']) + return pred diff --git a/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py new file mode 100644 index 000000000..0f628458f --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py @@ -0,0 +1,223 @@ +# %BANNER_BEGIN% +# --------------------------------------------------------------------- +# %COPYRIGHT_BEGIN% +# +# Magic Leap, Inc. ("COMPANY") CONFIDENTIAL +# +# Unpublished Copyright (c) 2020 +# Magic Leap, Inc., All Rights Reserved. +# +# NOTICE: All information contained herein is, and remains the property +# of COMPANY. The intellectual and technical concepts contained herein +# are proprietary to COMPANY and may be covered by U.S. and Foreign +# Patents, patents in process, and are protected by trade secret or +# copyright law. Dissemination of this information or reproduction of +# this material is strictly forbidden unless prior written permission is +# obtained from COMPANY. Access to the source code contained herein is +# hereby forbidden to anyone except current COMPANY employees, managers +# or contractors who have executed Confidentiality and Non-disclosure +# agreements explicitly covering such access. +# +# The copyright notice above does not evidence any actual or intended +# publication or disclosure of this source code, which includes +# information that is confidential and/or proprietary, and is a trade +# secret, of COMPANY. ANY REPRODUCTION, MODIFICATION, DISTRIBUTION, +# PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR THROUGH USE OF THIS +# SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF COMPANY IS +# STRICTLY PROHIBITED, AND IN VIOLATION OF APPLICABLE LAWS AND +# INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS SOURCE +# CODE AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS +# TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, +# USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN WHOLE OR IN PART. 
+# +# %COPYRIGHT_END% +# ---------------------------------------------------------------------- +# %AUTHORS_BEGIN% +# +# Originating Authors: Paul-Edouard Sarlin +# +# %AUTHORS_END% +# --------------------------------------------------------------------*/ +# %BANNER_END% + +# Adapted by Remi Pautrat, Philipp Lindenberger + +import os.path as osp + +import torch +from kornia.color import rgb_to_grayscale +from torch import nn + +from .utils import Extractor + + +def simple_nms(scores, nms_radius: int): + """Fast Non-maximum suppression to remove nearby points""" + assert nms_radius >= 0 + + def max_pool(x): + return torch.nn.functional.max_pool2d( + x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) + + zeros = torch.zeros_like(scores) + max_mask = scores == max_pool(scores) + for _ in range(2): + supp_mask = max_pool(max_mask.float()) > 0 + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == max_pool(supp_scores) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +def top_k_keypoints(keypoints, scores, k): + if k >= len(keypoints): + return keypoints, scores + scores, indices = torch.topk(scores, k, dim=0, sorted=True) + return keypoints[indices], scores + + +def sample_descriptors(keypoints, descriptors, s: int = 8): + """Interpolate descriptors at keypoint locations""" + b, c, h, w = descriptors.shape + keypoints = keypoints - s / 2 + 0.5 + keypoints /= torch.tensor([(w * s - s / 2 - 0.5), + (h * s - s / 2 - 0.5)], ).to(keypoints)[None] + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + args = {'align_corners': True} if torch.__version__ >= '1.3' else {} + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode='bilinear', **args) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1) + return descriptors + + +class SuperPoint(Extractor): + """SuperPoint Convolutional Detector and Descriptor + + SuperPoint: Self-Supervised Interest Point Detection and + Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew + Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629 + + """ + + default_conf = { + 'descriptor_dim': 256, + 'nms_radius': 4, + 'max_num_keypoints': None, + 'detection_threshold': 0.0005, + 'remove_borders': 4, + } + + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, model_dir, **conf): + super().__init__(**conf) # Update with default configuration. 
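+ # Added note (not in the original file): the detector head built below predicts
+ # 65 channels per 8x8 cell -- 64 sub-pixel positions plus one "dustbin"
+ # (no-keypoint) channel that is discarded after the softmax in forward().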
+ self.relu = nn.ReLU(inplace=True) + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256 + + self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1) + self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1) + self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1) + self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1) + self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1) + self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1) + self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1) + self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1) + + self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0) + + self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convDb = nn.Conv2d( + c5, self.conf.descriptor_dim, kernel_size=1, stride=1, padding=0) + + weights_path = osp.join(model_dir, 'superpoint_v1.pth') + self.load_state_dict(torch.load(weights_path, map_location='cpu')) + + if self.conf.max_num_keypoints is not None and self.conf.max_num_keypoints <= 0: + raise ValueError('max_num_keypoints must be positive or None') + + def forward(self, data: dict) -> dict: + """Compute keypoints, scores, descriptors for image""" + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + image = data['image'] + if image.shape[1] == 3: + image = rgb_to_grayscale(image) + + # Shared Encoder + x = self.relu(self.conv1a(image)) + x = self.relu(self.conv1b(x)) + x = self.pool(x) + x = self.relu(self.conv2a(x)) + x = self.relu(self.conv2b(x)) + x = self.pool(x) + x = self.relu(self.conv3a(x)) + x = self.relu(self.conv3b(x)) + x = self.pool(x) + x = self.relu(self.conv4a(x)) + x = self.relu(self.conv4b(x)) + + # Compute the dense keypoint scores + cPa = self.relu(self.convPa(x)) + scores = self.convPb(cPa) + scores = torch.nn.functional.softmax(scores, 1)[:, :-1] + b, _, h, w = scores.shape + scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8) + scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8) + scores = simple_nms(scores, self.conf.nms_radius) + + # Discard keypoints near the image borders + if self.conf.remove_borders: + pad = self.conf.remove_borders + scores[:, :pad] = -1 + scores[:, :, :pad] = -1 + scores[:, -pad:] = -1 + scores[:, :, -pad:] = -1 + + # Extract keypoints + best_kp = torch.where(scores > self.conf.detection_threshold) + scores = scores[best_kp] + + # Separate into batches + keypoints = [ + torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] + for i in range(b) + ] + scores = [scores[best_kp[0] == i] for i in range(b)] + + # Keep the k keypoints with highest score + if self.conf.max_num_keypoints is not None: + keypoints, scores = list( + zip(*[ + top_k_keypoints(k, s, self.conf.max_num_keypoints) + for k, s in zip(keypoints, scores) + ])) + + # Convert (h, w) to (x, y) + keypoints = [torch.flip(k, [1]).float() for k in keypoints] + + # Compute the dense descriptors + cDa = self.relu(self.convDa(x)) + descriptors = self.convDb(cDa) + descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1) + + # Extract descriptors + descriptors = [ + sample_descriptors(k[None], d[None], 8)[0] + for k, d in zip(keypoints, descriptors) + ] + + return { + 'keypoints': + torch.stack(keypoints, 0), + 'keypoint_scores': + torch.stack(scores, 0), + 'descriptors': + torch.stack(descriptors, 0).transpose(-1, 
-2).contiguous(), + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/utils.py b/modelscope/models/cv/image_matching_fast/lightglue/utils.py new file mode 100644 index 000000000..86621e170 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/utils.py @@ -0,0 +1,172 @@ +import collections.abc as collections +from pathlib import Path +from types import SimpleNamespace +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import kornia +import numpy as np +import torch + + +class ImagePreprocessor: + default_conf = { + 'resize': None, # target edge length, None for no resizing + 'side': 'long', + 'interpolation': 'bilinear', + 'align_corners': None, + 'antialias': True, + } + + def __init__(self, **conf) -> None: + super().__init__() + self.conf = {**self.default_conf, **conf} + self.conf = SimpleNamespace(**self.conf) + + def __call__(self, img: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Resize and preprocess an image, return image and resize scale""" + h, w = img.shape[-2:] + if self.conf.resize is not None: + img = kornia.geometry.transform.resize( + img, + self.conf.resize, + side=self.conf.side, + antialias=self.conf.antialias, + align_corners=self.conf.align_corners, + ) + scale = torch.Tensor([img.shape[-1] / w, img.shape[-2] / h]).to(img) + return img, scale + + +def map_tensor(input_, func: Callable): + string_classes = (str, bytes) + if isinstance(input_, string_classes): + return input_ + elif isinstance(input_, collections.Mapping): + return {k: map_tensor(sample, func) for k, sample in input_.items()} + elif isinstance(input_, collections.Sequence): + return [map_tensor(sample, func) for sample in input_] + elif isinstance(input_, torch.Tensor): + return func(input_) + else: + return input_ + + +def batch_to_device(batch: dict, + device: str = 'cpu', + non_blocking: bool = True): + """Move batch (dict) to device""" + + def _func(tensor): + return tensor.to(device=device, non_blocking=non_blocking).detach() + + return map_tensor(batch, _func) + + +def rbd(data: dict) -> dict: + """Remove batch dimension from elements in data""" + return { + k: v[0] if isinstance(v, (torch.Tensor, np.ndarray, list)) else v + for k, v in data.items() + } + + +def read_image(path: Path, grayscale: bool = False) -> np.ndarray: + """Read an image from path as RGB or grayscale""" + if not Path(path).exists(): + raise FileNotFoundError(f'No image at path {path}.') + mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR + image = cv2.imread(str(path), mode) + if image is None: + raise IOError(f'Could not read image at {path}.') + if not grayscale: + image = image[..., ::-1] + return image + + +def numpy_image_to_torch(image: np.ndarray) -> torch.Tensor: + """Normalize the image tensor and reorder the dimensions.""" + if image.ndim == 3: + image = image.transpose((2, 0, 1)) # HxWxC to CxHxW + elif image.ndim == 2: + image = image[None] # add channel axis + else: + raise ValueError(f'Not an image: {image.shape}') + return torch.tensor(image / 255.0, dtype=torch.float) + + +def resize_image( + image: np.ndarray, + size: Union[List[int], int], + fn: str = 'max', + interp: Optional[str] = 'area', +) -> np.ndarray: + """Resize an image to a fixed size, or according to max or min edge.""" + h, w = image.shape[:2] + + fn = {'max': max, 'min': min}[fn] + if isinstance(size, int): + scale = size / fn(h, w) + h_new, w_new = int(round(h * scale)), int(round(w * scale)) + scale = (w_new / w, h_new / h) + elif isinstance(size, (tuple, list)): 
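+ # Added comment (hedged): an int `size` rescales so that the max/min edge equals
+ # `size`; a (height, width) pair is used verbatim, e.g. size=(480, 640) maps any
+ # image to 480x640 irrespective of aspect ratio (illustrative values).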
+ h_new, w_new = size + scale = (w_new / w, h_new / h) + else: + raise ValueError(f'Incorrect new size: {size}') + mode = { + 'linear': cv2.INTER_LINEAR, + 'cubic': cv2.INTER_CUBIC, + 'nearest': cv2.INTER_NEAREST, + 'area': cv2.INTER_AREA, + }[interp] + return cv2.resize(image, (w_new, h_new), interpolation=mode), scale + + +def load_image(path: Path, resize: int = None, **kwargs) -> torch.Tensor: + image = read_image(path) + if resize is not None: + image, _ = resize_image(image, resize, **kwargs) + return numpy_image_to_torch(image) + + +class Extractor(torch.nn.Module): + + def __init__(self, **conf): + super().__init__() + self.conf = SimpleNamespace(**{**self.default_conf, **conf}) + + @torch.no_grad() + def extract(self, img: torch.Tensor, **conf) -> dict: + """Perform extraction with online resizing""" + if img.dim() == 3: + img = img[None] # add batch dim + assert img.dim() == 4 and img.shape[0] == 1 + shape = img.shape[-2:][::-1] + img, scales = ImagePreprocessor(**{ + **self.preprocess_conf, + **conf + })( + img) + feats = self.forward({'image': img}) + feats['image_size'] = torch.tensor(shape)[None].to(img).float() + feats['keypoints'] = (feats['keypoints'] + 0.5) / scales[None] - 0.5 + return feats + + +def match_pair( + extractor, + matcher, + image0: torch.Tensor, + image1: torch.Tensor, + device: str = 'cpu', + **preprocess, +): + """Match a pair of images (image0, image1) with an extractor and matcher""" + feats0 = extractor.extract(image0, **preprocess) + feats1 = extractor.extract(image1, **preprocess) + matches01 = matcher({'image0': feats0, 'image1': feats1}) + data = [feats0, feats1, matches01] + # remove batch dim and move to target device + feats0, feats1, matches01 = [batch_to_device(rbd(x), device) for x in data] + return feats0, feats1, matches01 diff --git a/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py new file mode 100644 index 000000000..13ea8a589 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py @@ -0,0 +1,199 @@ +""" +2D visualization primitives based on Matplotlib. +1) Plot images with `plot_images`. +2) Call `plot_keypoints` or `plot_matches` any number of times. +3) Optionally: save a .png or .pdf plot (nice in papers!) with `save_plot`. +""" + +import matplotlib +import matplotlib.patheffects as path_effects +import matplotlib.pyplot as plt +import numpy as np +import torch + + +def cm_RdGn(x): + """Custom colormap: red (0) -> yellow (0.5) -> green (1).""" + x = np.clip(x, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0]]) + (2 - x) * np.array([[1.0, 0, 0]]) + return np.clip(c, 0, 1) + + +def cm_BlRdGn(x_): + """Custom colormap: blue (-1) -> red (0.0) -> green (1).""" + x = np.clip(x_, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array( + [[1.0, 0, 0, 1.0]]) + + xn = -np.clip(x_, -1, 0)[..., None] * 2 + cn = xn * np.array([[0, 0.1, 1, 1.0]]) + (2 - xn) * np.array( + [[1.0, 0, 0, 1.0]]) + out = np.clip(np.where(x_[..., None] < 0, cn, c), 0, 1) + return out + + +def cm_prune(x_): + """Custom colormap to visualize pruning""" + if isinstance(x_, torch.Tensor): + x_ = x_.cpu().numpy() + max_i = max(x_) + norm_x = np.where(x_ == max_i, -1, (x_ - 1) / 9) + return cm_BlRdGn(norm_x) + + +def plot_images(imgs, + titles=None, + cmaps='gray', + dpi=100, + pad=0.5, + adaptive=True): + """Plot a set of images horizontally. + Args: + imgs: list of NumPy RGB (H, W, 3) or PyTorch RGB (3, H, W) or mono (H, W). 
+ titles: a list of strings, as titles for each image. + cmaps: colormaps for monochrome images. + adaptive: whether the figure size should fit the image aspect ratios. + """ + # conversion to (H, W, 3) for torch.Tensor + imgs = [ + img.permute(1, 2, 0).cpu().numpy() if + (isinstance(img, torch.Tensor) and img.dim() == 3) else img + for img in imgs + ] + + n = len(imgs) + if not isinstance(cmaps, (list, tuple)): + cmaps = [cmaps] * n + + if adaptive: + ratios = [i.shape[1] / i.shape[0] for i in imgs] # W / H + else: + ratios = [4 / 3] * n + figsize = [sum(ratios) * 4.5, 4.5] + fig, ax = plt.subplots( + 1, n, figsize=figsize, dpi=dpi, gridspec_kw={'width_ratios': ratios}) + if n == 1: + ax = [ax] + for i in range(n): + ax[i].imshow(imgs[i], cmap=plt.get_cmap(cmaps[i])) + ax[i].get_yaxis().set_ticks([]) + ax[i].get_xaxis().set_ticks([]) + ax[i].set_axis_off() + for spine in ax[i].spines.values(): # remove frame + spine.set_visible(False) + if titles: + ax[i].set_title(titles[i]) + fig.tight_layout(pad=pad) + + +def plot_keypoints(kpts, colors='lime', ps=4, axes=None, a=1.0): + """Plot keypoints for existing images. + Args: + kpts: list of ndarrays of size (N, 2). + colors: string, or list of list of tuples (one for each keypoints). + ps: size of the keypoints as float. + """ + if not isinstance(colors, list): + colors = [colors] * len(kpts) + if not isinstance(a, list): + a = [a] * len(kpts) + if axes is None: + axes = plt.gcf().axes + for ax, k, c, alpha in zip(axes, kpts, colors, a): + if isinstance(k, torch.Tensor): + k = k.cpu().numpy() + ax.scatter(k[:, 0], k[:, 1], c=c, s=ps, linewidths=0, alpha=alpha) + + +def plot_matches(kpts0, + kpts1, + color=None, + lw=1.5, + ps=4, + a=1.0, + labels=None, + axes=None): + """Plot matches for a pair of existing images. + Args: + kpts0, kpts1: corresponding keypoints of size (N, 2). + color: color of each match, string or RGB tuple. Random if not given. + lw: width of the lines. + ps: size of the end points (no endpoint if ps=0) + indices: indices of the images to draw the matches on. + a: alpha opacity of the match lines. 
+ """ + fig = plt.gcf() + if axes is None: + ax = fig.axes + ax0, ax1 = ax[0], ax[1] + else: + ax0, ax1 = axes + if isinstance(kpts0, torch.Tensor): + kpts0 = kpts0.cpu().numpy() + if isinstance(kpts1, torch.Tensor): + kpts1 = kpts1.cpu().numpy() + assert len(kpts0) == len(kpts1) + if color is None: + color = matplotlib.cm.hsv(np.random.rand(len(kpts0))).tolist() + elif len(color) > 0 and not isinstance(color[0], (tuple, list)): + color = [color] * len(kpts0) + + if lw > 0: + for i in range(len(kpts0)): + line = matplotlib.patches.ConnectionPatch( + xyA=(kpts0[i, 0], kpts0[i, 1]), + xyB=(kpts1[i, 0], kpts1[i, 1]), + coordsA=ax0.transData, + coordsB=ax1.transData, + axesA=ax0, + axesB=ax1, + zorder=1, + color=color[i], + linewidth=lw, + clip_on=True, + alpha=a, + label=None if labels is None else labels[i], + picker=5.0, + ) + line.set_annotation_clip(True) + fig.add_artist(line) + + # freeze the axes to prevent the transform to change + ax0.autoscale(enable=False) + ax1.autoscale(enable=False) + + if ps > 0: + ax0.scatter(kpts0[:, 0], kpts0[:, 1], c=color, s=ps) + ax1.scatter(kpts1[:, 0], kpts1[:, 1], c=color, s=ps) + + +def add_text( + idx, + text, + pos=(0.01, 0.99), + fs=15, + color='w', + lcolor='k', + lwidth=2, + ha='left', + va='top', +): + ax = plt.gcf().axes[idx] + t = ax.text( + *pos, + text, + fontsize=fs, + ha=ha, + va=va, + color=color, + transform=ax.transAxes) + if lcolor is not None: + t.set_path_effects([ + path_effects.Stroke(linewidth=lwidth, foreground=lcolor), + path_effects.Normal(), + ]) + + +def save_plot(path, **kw): + """Save the current figure without any white margin.""" + plt.savefig(path, bbox_inches='tight', pad_inches=0, **kw) diff --git a/modelscope/models/cv/image_matching_fast/lightglue_model.py b/modelscope/models/cv/image_matching_fast/lightglue_model.py new file mode 100644 index 000000000..8043051c2 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue_model.py @@ -0,0 +1,98 @@ +# The implementation is made publicly available under the +# Apache 2.0 license at https://github.com/cvg/LightGlue + +import os.path as osp +from pathlib import Path + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .config.default import lightglue_default_conf +from .lightglue import ALIKED, DISK, SIFT, LightGlue, SuperPoint +from .lightglue.utils import numpy_image_to_torch, rbd + + +@MODELS.register_module( + Tasks.image_matching, module_name=Models.lightglue_image_matching) +class LightGlueImageMatching(TorchModel): + ''' + LightGlue is an simple but effective enhancement of the state-of-the-art image matching method, SuperGlue. 
+ For more details, please refer to https://arxiv.org/abs/2306.13643 + ''' + + def __init__(self, model_dir: str, max_num_keypoints=2048, **kwargs): + + super().__init__(model_dir, **kwargs) + + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') # 'mps', 'cpu' + + features = lightglue_default_conf.get('features', 'superpoint') + + if features == 'disk': + self.extractor = DISK( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + elif features == 'aliked': + self.extractor = ALIKED( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + elif features == 'sift': + self.extractor = SIFT( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + else: + self.extractor = SuperPoint( + model_dir=model_dir, + max_num_keypoints=max_num_keypoints).eval().to(self.device) + + self.matcher = LightGlue( + model_dir=model_dir, + default_conf=lightglue_default_conf).eval().to(self.device) + + def forward(self, inputs): + ''' + Args: + inputs: a dict with keys 'image0', 'image1' + ''' + + feats0 = self.extractor.extract( + numpy_image_to_torch(inputs['image0']).to(self.device)) + feats1 = self.extractor.extract( + numpy_image_to_torch(inputs['image1']).to(self.device)) + matches01 = self.matcher({'image0': feats0, 'image1': feats1}) + + return [feats0, feats1, matches01] + + def postprocess(self, inputs): + ''' + Args: + inputs: a list of feats0, feats1, matches01 + ''' + matching_result = inputs + feats0, feats1, matches01 = [rbd(x) for x in matching_result + ] # remove batch dimension + + kpts0, kpts1, matches = feats0['keypoints'], feats1[ + 'keypoints'], matches01['matches'] + m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]] + + # match confidence + confidence = matches01['scores'] + + matches_result = { + 'kpts0': m_kpts0, + 'kpts1': m_kpts1, + 'confidence': confidence + } + + results = {OutputKeys.MATCHES: matches_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py new file mode 100644 index 000000000..691834510 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .geomvsnet_model import GeoMVSNetDepthEstimation + +else: + _import_structure = { + 'geomvsnet_model': ['GeoMVSNetDepthEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py new file mode 100644 index 000000000..37d92c13a --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py @@ -0,0 +1,472 @@ +# The implementation is borrowed from https://github.com/YoYo000/MVSNet. Model reading is provided by COLMAP. 
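+# Added orientation comment (hedged, not from the original script): the helpers
+# below parse a COLMAP sparse model (cameras.{txt,bin}, images.{txt,bin},
+# points3D.{txt,bin}) into the Camera/Image/Point3D namedtuples defined here,
+# e.g. (hypothetical paths):
+#     cameras = read_cameras_binary('sparse/0/cameras.bin')
+#     images = read_images_binary('sparse/0/images.bin')
+#     fx = cameras[images[1].camera_id].params[0]  # first intrinsic parameter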
+ +from __future__ import print_function +import collections +import multiprocessing as mp +import os +import shutil +import struct +from functools import partial + +import cv2 +import numpy as np + +# ============================ read_model.py ============================# +CameraModel = collections.namedtuple('CameraModel', + ['model_id', 'model_name', 'num_params']) +Camera = collections.namedtuple('Camera', + ['id', 'model', 'width', 'height', 'params']) +BaseImage = collections.namedtuple( + 'Image', ['id', 'qvec', 'tvec', 'camera_id', 'name', 'xys', 'point3D_ids']) +Point3D = collections.namedtuple( + 'Point3D', ['id', 'xyz', 'rgb', 'error', 'image_ids', 'point2D_idxs']) + + +class Image(BaseImage): + + def qvec2rotmat(self): + return qvec2rotmat(self.qvec) + + +CAMERA_MODELS = { + CameraModel(model_id=0, model_name='SIMPLE_PINHOLE', num_params=3), + CameraModel(model_id=1, model_name='PINHOLE', num_params=4), + CameraModel(model_id=2, model_name='SIMPLE_RADIAL', num_params=4), + CameraModel(model_id=3, model_name='RADIAL', num_params=5), + CameraModel(model_id=4, model_name='OPENCV', num_params=8), + CameraModel(model_id=5, model_name='OPENCV_FISHEYE', num_params=8), + CameraModel(model_id=6, model_name='FULL_OPENCV', num_params=12), + CameraModel(model_id=7, model_name='FOV', num_params=5), + CameraModel(model_id=8, model_name='SIMPLE_RADIAL_FISHEYE', num_params=4), + CameraModel(model_id=9, model_name='RADIAL_FISHEYE', num_params=5), + CameraModel(model_id=10, model_name='THIN_PRISM_FISHEYE', num_params=12) +} +CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) + for camera_model in CAMERA_MODELS]) + + +def read_next_bytes(fid, + num_bytes, + format_char_sequence, + endian_character='<'): + """Read and unpack the next bytes from a binary file. + :param fid: + :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. + :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. + :param endian_character: Any of {@, =, <, >, !} + :return: Tuple of read and unpacked values. 
+ """ + data = fid.read(num_bytes) + return struct.unpack(endian_character + format_char_sequence, data) + + +def read_cameras_text(path): + cameras = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + camera_id = int(elems[0]) + model = elems[1] + width = int(elems[2]) + height = int(elems[3]) + params = np.array(tuple(map(float, elems[4:]))) + cameras[camera_id] = Camera( + id=camera_id, + model=model, + width=width, + height=height, + params=params) + return cameras + + +def read_cameras_binary(path_to_model_file): + cameras = {} + with open(path_to_model_file, 'rb') as fid: + num_cameras = read_next_bytes(fid, 8, 'Q')[0] + for camera_line_index in range(num_cameras): + camera_properties = read_next_bytes( + fid, num_bytes=24, format_char_sequence='iiQQ') + camera_id = camera_properties[0] + model_id = camera_properties[1] + model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name + width = camera_properties[2] + height = camera_properties[3] + num_params = CAMERA_MODEL_IDS[model_id].num_params + params = read_next_bytes( + fid, + num_bytes=8 * num_params, + format_char_sequence='d' * num_params) + cameras[camera_id] = Camera( + id=camera_id, + model=model_name, + width=width, + height=height, + params=np.array(params)) + assert len(cameras) == num_cameras + return cameras + + +def read_images_text(path): + images = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + image_id = int(elems[0]) + qvec = np.array(tuple(map(float, elems[1:5]))) + tvec = np.array(tuple(map(float, elems[5:8]))) + camera_id = int(elems[8]) + image_name = elems[9] + elems = fid.readline().split() + xys = np.column_stack([ + tuple(map(float, elems[0::3])), + tuple(map(float, elems[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, elems[2::3]))) + images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_images_binary(path_to_model_file): + images = {} + with open(path_to_model_file, 'rb') as fid: + num_reg_images = read_next_bytes(fid, 8, 'Q')[0] + for image_index in range(num_reg_images): + binary_image_properties = read_next_bytes( + fid, num_bytes=64, format_char_sequence='idddddddi') + image_id = binary_image_properties[0] + qvec = np.array(binary_image_properties[1:5]) + tvec = np.array(binary_image_properties[5:8]) + camera_id = binary_image_properties[8] + image_name = '' + current_char = read_next_bytes(fid, 1, 'c')[0] + while current_char != b'\x00': # look for the ASCII 0 entry + image_name += current_char.decode('utf-8') + current_char = read_next_bytes(fid, 1, 'c')[0] + num_points2D = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + x_y_id_s = read_next_bytes( + fid, + num_bytes=24 * num_points2D, + format_char_sequence='ddq' * num_points2D) + xys = np.column_stack([ + tuple(map(float, x_y_id_s[0::3])), + tuple(map(float, x_y_id_s[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) + images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_points3D_text(path): + points3D = {} + with open(path, 'r', encoding='utf-8') as fid: + while 
True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + point3D_id = int(elems[0]) + xyz = np.array(tuple(map(float, elems[1:4]))) + rgb = np.array(tuple(map(int, elems[4:7]))) + error = float(elems[7]) + image_ids = np.array(tuple(map(int, elems[8::2]))) + point2D_idxs = np.array(tuple(map(int, elems[9::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_points3d_binary(path_to_model_file): + points3D = {} + with open(path_to_model_file, 'rb') as fid: + num_points = read_next_bytes(fid, 8, 'Q')[0] + for point_line_index in range(num_points): + binary_point_line_properties = read_next_bytes( + fid, num_bytes=43, format_char_sequence='QdddBBBd') + point3D_id = binary_point_line_properties[0] + xyz = np.array(binary_point_line_properties[1:4]) + rgb = np.array(binary_point_line_properties[4:7]) + error = np.array(binary_point_line_properties[7]) + track_length = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + track_elems = read_next_bytes( + fid, + num_bytes=8 * track_length, + format_char_sequence='ii' * track_length) + image_ids = np.array(tuple(map(int, track_elems[0::2]))) + point2D_idxs = np.array(tuple(map(int, track_elems[1::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_model(path, ext): + if ext == '.txt': + cameras = read_cameras_text(os.path.join(path, 'cameras' + ext)) + images = read_images_text(os.path.join(path, 'images' + ext)) + points3D = read_points3D_text(os.path.join(path, 'points3D') + ext) + else: + cameras = read_cameras_binary(os.path.join(path, 'cameras' + ext)) + images = read_images_binary(os.path.join(path, 'images' + ext)) + points3D = read_points3d_binary(os.path.join(path, 'points3D') + ext) + return cameras, images, points3D + + +def qvec2rotmat(qvec): + return np.array([ + [ + 1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, + 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], + 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2] + ], # noqa + [ + 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], + 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, + 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1] + ], # noqa + [ + 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], + 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], + 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2 + ] + ]) # noqa + + +def rotmat2qvec(R): + Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat + K = np.array( + [[Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], + [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], + [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 # noqa + eigvals, eigvecs = np.linalg.eigh(K) + qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] + if qvec[0] < 0: + qvec *= -1 + return qvec + + +def calc_score(inputs, images, points3d, extrinsic, args): + i, j = inputs + id_i = images[i + 1].point3D_ids + id_j = images[j + 1].point3D_ids + id_intersect = [it for it in id_i if it in id_j] + cam_center_i = -np.matmul(extrinsic[i + 1][:3, :3].transpose(), + extrinsic[i + 1][:3, 3:4])[:, 0] + cam_center_j = -np.matmul(extrinsic[j + 1][:3, :3].transpose(), + extrinsic[j + 1][:3, 3:4])[:, 0] + score = 0 + for pid in id_intersect: + if pid == -1: + continue + p = points3d[pid].xyz + theta = (180 / np.pi) * np.arccos( + np.dot(cam_center_i - p, cam_center_j - p) + / 
np.linalg.norm(cam_center_i - p) + / np.linalg.norm(cam_center_j - p)) + tmp_value = ( + 2 * # noqa + (args.sigma1 if theta <= args.theta0 else args.sigma2)**2) + score += np.exp(-(theta - args.theta0) * # noqa + (theta - args.theta0) / tmp_value) + return i, j, score + + +def processing_single_scene(args): + + image_dir = os.path.join(args.dense_folder, 'images') + model_dir = os.path.join(args.dense_folder, 'sparse') + cam_dir = os.path.join(args.save_folder, 'cams') + image_converted_dir = os.path.join(args.save_folder, 'images_post') + + if os.path.exists(image_converted_dir): + shutil.rmtree(image_converted_dir) + os.makedirs(image_converted_dir) + if os.path.exists(cam_dir): + shutil.rmtree(cam_dir) + + cameras, images, points3d = read_model(model_dir, args.model_ext) + num_images = len(list(images.items())) + + param_type = { + 'SIMPLE_PINHOLE': ['f', 'cx', 'cy'], + 'PINHOLE': ['fx', 'fy', 'cx', 'cy'], + 'SIMPLE_RADIAL': ['f', 'cx', 'cy', 'k'], + 'SIMPLE_RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k'], + 'RADIAL': ['f', 'cx', 'cy', 'k1', 'k2'], + 'RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k1', 'k2'], + 'OPENCV': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2'], + 'OPENCV_FISHEYE': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'k3', 'k4'], + 'FULL_OPENCV': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'k5', + 'k6' + ], + 'FOV': ['fx', 'fy', 'cx', 'cy', 'omega'], + 'THIN_PRISM_FISHEYE': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'sx1', + 'sy1' + ] + } + + # intrinsic + intrinsic = {} + for camera_id, cam in cameras.items(): + params_dict = { + key: value + for key, value in zip(param_type[cam.model], cam.params) + } + if 'f' in param_type[cam.model]: + params_dict['fx'] = params_dict['f'] + params_dict['fy'] = params_dict['f'] + i = np.array([[params_dict['fx'], 0, params_dict['cx']], + [0, params_dict['fy'], params_dict['cy']], [0, 0, 1]]) + intrinsic[camera_id] = i + + new_images = {} + for i, image_id in enumerate(sorted(images.keys())): + new_images[i + 1] = images[image_id] + images = new_images + + # extrinsic + extrinsic = {} + for image_id, image in images.items(): + e = np.zeros((4, 4)) + e[:3, :3] = qvec2rotmat(image.qvec) + e[:3, 3] = image.tvec + e[3, 3] = 1 + extrinsic[image_id] = e + + # depth range and interval + depth_ranges = {} + for i in range(num_images): + zs = [] + for p3d_id in images[i + 1].point3D_ids: + if p3d_id == -1: + continue + transformed = np.matmul(extrinsic[i + 1], [ + points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1], + points3d[p3d_id].xyz[2], 1 + ]) + zs.append(transformed[2].item()) + zs_sorted = sorted(zs) + # relaxed depth range + max_ratio = 0.1 + min_ratio = 0.03 + num_max = max(5, int(len(zs) * max_ratio)) + num_min = max(1, int(len(zs) * min_ratio)) + depth_min = 1.0 * sum(zs_sorted[:num_min]) / len(zs_sorted[:num_min]) + depth_max = 1.0 * sum(zs_sorted[-num_max:]) / len(zs_sorted[-num_max:]) + if args.max_d == 0: + image_int = intrinsic[images[i + 1].camera_id] + image_ext = extrinsic[i + 1] + image_r = image_ext[0:3, 0:3] + image_t = image_ext[0:3, 3] + p1 = [image_int[0, 2], image_int[1, 2], 1] + p2 = [image_int[0, 2] + 1, image_int[1, 2], 1] + P1 = np.matmul(np.linalg.inv(image_int), p1) * depth_min + P1 = np.matmul(np.linalg.inv(image_r), (P1 - image_t)) + P2 = np.matmul(np.linalg.inv(image_int), p2) * depth_min + P2 = np.matmul(np.linalg.inv(image_r), (P2 - image_t)) + depth_num = (1 / depth_min - 1 / depth_max) / ( + 1 / depth_min - 1 / (depth_min + np.linalg.norm(P2 - P1))) + else: + depth_num = args.max_d + 
depth_interval = (depth_max - depth_min) / (depth_num + - 1) / args.interval_scale + depth_ranges[i + 1] = (depth_min, depth_interval, depth_num, depth_max) + + # view selection + score = np.zeros((len(images), len(images))) + queue = [] + for i in range(len(images)): + for j in range(i + 1, len(images)): + queue.append((i, j)) + + p = mp.Pool(processes=mp.cpu_count()) + func = partial( + calc_score, + images=images, + points3d=points3d, + args=args, + extrinsic=extrinsic) + result = p.map(func, queue) + for i, j, s in result: + score[i, j] = s + score[j, i] = s + view_sel = [] + for i in range(len(images)): + sorted_score = np.argsort(score[i])[::-1] + view_sel.append([(k, score[i, k]) for k in sorted_score[:10]]) + + # write + os.makedirs(cam_dir, exist_ok=True) + + for i in range(num_images): + with open(os.path.join(cam_dir, '%08d_cam.txt' % i), 'w') as f: + f.write('extrinsic\n') + for j in range(4): + for k in range(4): + f.write(str(extrinsic[i + 1][j, k]) + ' ') + f.write('\n') + f.write('\nintrinsic\n') + for j in range(3): + for k in range(3): + f.write( + str(intrinsic[images[i + 1].camera_id][j, k]) + ' ') + f.write('\n') + f.write('\n%f %f %f %f\n' % + (depth_ranges[i + 1][0], depth_ranges[i + 1][1], + depth_ranges[i + 1][2], depth_ranges[i + 1][3])) + with open(os.path.join(args.save_folder, 'pair.txt'), 'w') as f: + f.write('%d\n' % len(images)) + for i, sorted_score in enumerate(view_sel): + f.write('%d\n%d ' % (i, len(sorted_score))) + for image_id, s in sorted_score: + f.write('%d %f ' % (image_id, s)) + f.write('\n') + + # convert to jpg + for i in range(num_images): + img_path = os.path.join(image_dir, images[i + 1].name) + if not img_path.endswith('.jpg'): + img = cv2.imread(img_path) + cv2.imwrite(os.path.join(image_converted_dir, '%08d.jpg' % i), img) + else: + shutil.copyfile( + os.path.join(image_dir, images[i + 1].name), + os.path.join(image_converted_dir, '%08d.jpg' % i)) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py new file mode 100644 index 000000000..05f1214a9 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py @@ -0,0 +1,249 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import os + +import cv2 +import numpy as np +from PIL import Image +from plyfile import PlyData, PlyElement + +from .general_eval_dataset import read_pfm + + +# read intrinsics and extrinsics +def read_camera_parameters(filename): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + # assume the feature is 1/4 of the original image size + # intrinsics[:2, :] /= 4 + return intrinsics, extrinsics + + +# read an image +def read_img(filename): + img = Image.open(filename) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
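+    # keep float32 values in [0, 1]; filter_depth() below reuses them directly as per-vertex colors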
+ return np_img + + +# read a binary mask +def read_mask(filename): + return read_img(filename) > 0.5 + + +# save a binary mask +def save_mask(filename, mask): + assert mask.dtype == bool + mask = mask.astype(np.uint8) * 255 + Image.fromarray(mask).save(filename) + + +# read a pair file, [(ref_view1, [src_view1-1, ...]), (ref_view2, [src_view2-1, ...]), ...] +def read_pair_file(filename): + data = [] + with open(filename) as f: + num_viewpoint = int(f.readline()) + # 49 viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [int(x) for x in f.readline().rstrip().split()[1::2]] + if len(src_views) > 0: + data.append((ref_view, src_views)) + return data + + +# project the reference point cloud into the source view, then project back +def reproject_with_depth(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, + intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + # step1. project reference pixels to the source view + # reference view x, y + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + x_ref, y_ref = x_ref.reshape([-1]), y_ref.reshape([-1]) + # reference 3D space + xyz_ref = np.matmul( + np.linalg.inv(intrinsics_ref), + np.vstack( + (x_ref, y_ref, np.ones_like(x_ref))) * depth_ref.reshape([-1])) + # source 3D space + xyz_src = np.matmul( + np.matmul(extrinsics_src, np.linalg.inv(extrinsics_ref)), + np.vstack((xyz_ref, np.ones_like(x_ref))))[:3] + # source view x, y + K_xyz_src = np.matmul(intrinsics_src, xyz_src) + xy_src = K_xyz_src[:2] / K_xyz_src[2:3] + + # step2. reproject the source view points with source view depth estimation + # find the depth estimation of the source view + x_src = xy_src[0].reshape([height, width]).astype(np.float32) + y_src = xy_src[1].reshape([height, width]).astype(np.float32) + sampled_depth_src = cv2.remap( + depth_src, x_src, y_src, interpolation=cv2.INTER_LINEAR) + + # source 3D space + # NOTE that we should use sampled source-view depth_here to project back + xyz_src = np.matmul( + np.linalg.inv(intrinsics_src), + np.vstack( + (xy_src, np.ones_like(x_ref))) * sampled_depth_src.reshape([-1])) + # reference 3D space + xyz_reprojected = np.matmul( + np.matmul(extrinsics_ref, np.linalg.inv(extrinsics_src)), + np.vstack((xyz_src, np.ones_like(x_ref))))[:3] + # source view x, y, depth + depth_reprojected = xyz_reprojected[2].reshape([height, + width]).astype(np.float32) + K_xyz_reprojected = np.matmul(intrinsics_ref, xyz_reprojected) + xy_reprojected = K_xyz_reprojected[:2] / K_xyz_reprojected[2:3] + x_reprojected = xy_reprojected[0].reshape([height, + width]).astype(np.float32) + y_reprojected = xy_reprojected[1].reshape([height, + width]).astype(np.float32) + + return depth_reprojected, x_reprojected, y_reprojected, x_src, y_src + + +def check_geometric_consistency(depth_ref, intrinsics_ref, extrinsics_ref, + depth_src, intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + depth_reprojected, x2d_reprojected, y2d_reprojected, x2d_src, y2d_src = reproject_with_depth( + depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, + extrinsics_src) + # check |p_reproj-p_1| < 1 + dist = np.sqrt((x2d_reprojected - x_ref)**2 + (y2d_reprojected - y_ref)**2) + + # check |d_reproj-d_1| / d_1 < 0.01 + depth_diff = np.abs(depth_reprojected - depth_ref) + relative_depth_diff = depth_diff / depth_ref + + mask = np.logical_and(dist < 1, 
relative_depth_diff < 0.01) + depth_reprojected[~mask] = 0 + + return mask, depth_reprojected, x2d_src, y2d_src + + +def filter_depth(pair_folder, scan_folder, out_folder, thres_view): + # the pair file + pair_file = os.path.join(pair_folder, 'pair.txt') + # for the final point cloud + vertexs = [] + vertex_colors = [] + + pair_data = read_pair_file(pair_file) + + # for each reference view and the corresponding source views + for ref_view, src_views in pair_data: + # src_views = src_views[:args.num_view] + # load the camera parameters + ref_intrinsics, ref_extrinsics = read_camera_parameters( + os.path.join(scan_folder, 'cams/{:0>8}_cam.txt'.format(ref_view))) + # load the reference image + ref_img = read_img( + os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(ref_view))) + # load the estimated depth of the reference view + ref_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(ref_view)))[0] + # load the photometric mask of the reference view + confidence = read_pfm( + os.path.join(out_folder, + 'confidence/{:0>8}.pfm'.format(ref_view)))[0] + photo_mask = confidence > 0.4 + + all_srcview_depth_ests = [] + all_srcview_x = [] + all_srcview_y = [] + all_srcview_geomask = [] + + # compute the geometric mask + geo_mask_sum = 0 + for src_view in src_views: + # camera parameters of the source view + src_intrinsics, src_extrinsics = read_camera_parameters( + os.path.join(scan_folder, + 'cams/{:0>8}_cam.txt'.format(src_view))) + # the estimated depth of the source view + src_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(src_view)))[0] + + geo_mask, depth_reprojected, x2d_src, y2d_src = check_geometric_consistency( + ref_depth_est, ref_intrinsics, ref_extrinsics, src_depth_est, + src_intrinsics, src_extrinsics) + geo_mask_sum += geo_mask.astype(np.int32) + all_srcview_depth_ests.append(depth_reprojected) + all_srcview_x.append(x2d_src) + all_srcview_y.append(y2d_src) + all_srcview_geomask.append(geo_mask) + + depth_est_averaged = (sum(all_srcview_depth_ests) + ref_depth_est) / ( + geo_mask_sum + 1) + # at least 3 source views matched + geo_mask = geo_mask_sum >= thres_view + final_mask = np.logical_and(photo_mask, geo_mask) + + os.makedirs(os.path.join(out_folder, 'mask'), exist_ok=True) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_photo.png'.format(ref_view)), + photo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_geo.png'.format(ref_view)), + geo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_final.png'.format(ref_view)), + final_mask) + + height, width = depth_est_averaged.shape[:2] + x, y = np.meshgrid(np.arange(0, width), np.arange(0, height)) + valid_points = final_mask + x, y, depth = x[valid_points], y[valid_points], depth_est_averaged[ + valid_points] + + color = ref_img[valid_points] + + xyz_ref = np.matmul( + np.linalg.inv(ref_intrinsics), + np.vstack((x, y, np.ones_like(x))) * depth) + xyz_world = np.matmul( + np.linalg.inv(ref_extrinsics), np.vstack( + (xyz_ref, np.ones_like(x))))[:3] + vertexs.append(xyz_world.transpose((1, 0))) + vertex_colors.append((color * 255).astype(np.uint8)) + + vertexs = np.concatenate(vertexs, axis=0) + vertex_colors = np.concatenate(vertex_colors, axis=0) + vertexs = np.array([tuple(v) for v in vertexs], + dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')]) + vertex_colors = np.array([tuple(v) for v in vertex_colors], + dtype=[('red', 'u1'), ('green', 'u1'), + ('blue', 'u1')]) + + vertex_all = np.empty( + len(vertexs), vertexs.dtype.descr + 
vertex_colors.dtype.descr) + for prop in vertexs.dtype.names: + vertex_all[prop] = vertexs[prop] + for prop in vertex_colors.dtype.names: + vertex_all[prop] = vertex_colors[prop] + + el = PlyElement.describe(vertex_all, 'vertex') + # PlyData([el]).write(plyfilename) + pcd = PlyData([el]) + + return pcd + + +def pcd_depth_filter(scene, test_dir, save_dir, thres_view): + old_scene_folder = os.path.join(test_dir, scene) + new_scene_folder = os.path.join(save_dir, scene) + out_folder = os.path.join(save_dir, scene) + pcd = filter_depth(old_scene_folder, new_scene_folder, out_folder, + thres_view) + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py new file mode 100644 index 000000000..0719d3fa0 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py @@ -0,0 +1,374 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import re +import sys + +import cv2 +import numpy as np +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms + + +def read_pfm(filename): + file = open(filename, 'rb') + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().decode('utf-8').rstrip() + if header == 'PF': + color = True + elif header == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + file.close() + return data, scale + + +def save_pfm(filename, image, scale=1): + file = open(filename, 'wb') + color = None + + image = np.flipud(image) + + if image.dtype.name != 'float32': + raise Exception('Image dtype must be float32.') + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif len(image.shape) == 2 or len( + image.shape) == 3 and image.shape[2] == 1: # greyscale + color = False + else: + raise Exception( + 'Image must have H x W x 3, H x W x 1 or H x W dimensions.') + + file.write('PF\n'.encode('utf-8') if color else 'Pf\n'.encode('utf-8')) + file.write('{} {}\n'.format(image.shape[1], + image.shape[0]).encode('utf-8')) + + endian = image.dtype.byteorder + + if endian == '<' or endian == '=' and sys.byteorder == 'little': + scale = -scale + + file.write(('%f\n' % scale).encode('utf-8')) + + image.tofile(file) + file.close() + + +S_H, S_W = 0, 0 + + +class MVSDataset(Dataset): + + def __init__(self, root_dir, list_file, mode, n_views, **kwargs): + super(MVSDataset, self).__init__() + + self.root_dir = root_dir + self.list_file = list_file + self.mode = mode + self.n_views = n_views + + assert self.mode in ['train', 'val', 'test'] + + self.total_depths = 192 + self.interval_scale = 1.06 + + self.data_scale = kwargs.get('data_scale', 'mid') # mid / raw + self.robust_train = kwargs.get('robust_train', False) # True / False + self.color_augment = transforms.ColorJitter( + brightness=0.5, contrast=0.5) + + if self.mode == 'test': + self.max_wh = kwargs.get('max_wh', (1600, 1200)) + self.max_w, 
self.max_h = self.max_wh + + self.fix_res = kwargs.get( + 'fix_res', False) # whether to fix the resolution of input image. + self.fix_wh = False + + # self.metas = self.build_metas() + self.metas = self.build_list() + + def build_list(self): + metas = [] + scans = self.list_file + # logger.info("MVSDataset scans:", scans) + + interval_scale_dict = {} + # scans + for scan in scans: + # determine the interval scale of each scene. default is 1.06 + if isinstance(self.interval_scale, float): + interval_scale_dict[scan] = self.interval_scale + else: + interval_scale_dict[scan] = self.interval_scale[scan] + + pair_file = '{}/pair.txt'.format(scan) + # read the pair file + with open(os.path.join(self.root_dir, pair_file)) as f: + num_viewpoint = int(f.readline()) + # viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [ + int(x) for x in f.readline().rstrip().split()[1::2] + ] + # filter by no src view and fill to nviews + if len(src_views) > 0: + if len(src_views) < self.n_views: + src_views += [src_views[0]] * ( + self.n_views - len(src_views)) + metas.append((scan, ref_view, src_views, scan)) + + self.interval_scale = interval_scale_dict + return metas + + def __len__(self): + return len(self.metas) + + def read_cam_file(self, filename, interval_scale): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + intrinsics[:2, :] /= 4.0 + # depth_min & depth_interval: line 11 + depth_min = float(lines[11].split()[0]) + depth_interval = float(lines[11].split()[1]) + + if len(lines[11].split()) >= 3: + num_depth = lines[11].split()[2] + depth_max = depth_min + int(float(num_depth)) * depth_interval + depth_interval = (depth_max - depth_min) / self.total_depths + + depth_interval *= interval_scale + + return intrinsics, extrinsics, depth_min, depth_interval + + def read_img(self, filename): + img = Image.open(filename) + if self.mode == 'train' and self.robust_train: + img = self.color_augment(img) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
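+        # the ColorJitter augmentation above is applied only for robust training; at test time images are just rescaled to [0, 1]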
+ return np_img + + def crop_img(self, img): + raw_h, raw_w = img.shape[:2] + start_h = (raw_h - 1024) // 2 + start_w = (raw_w - 1280) // 2 + return img[start_h:start_h + 1024, + start_w:start_w + 1280, :] # (1024, 1280) + + def prepare_img(self, hr_img): + h, w = hr_img.shape + if self.data_scale == 'mid': + hr_img_ds = cv2.resize( + hr_img, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST) + h, w = hr_img_ds.shape + target_h, target_w = 512, 640 + start_h, start_w = (h - target_h) // 2, (w - target_w) // 2 + hr_img_crop = hr_img_ds[start_h:start_h + target_h, + start_w:start_w + target_w] + elif self.data_scale == 'raw': + hr_img_crop = hr_img[h // 2 - 1024 // 2:h // 2 + 1024 // 2, + w // 2 - 1280 // 2:w // 2 + + 1280 // 2] # (1024, 1280) + return hr_img_crop + + def scale_mvs_input(self, img, intrinsics, max_w, max_h, base=64): + h, w = img.shape[:2] + if h > max_h or w > max_w: + scale = 1.0 * max_h / h + if scale * w > max_w: + scale = 1.0 * max_w / w + new_w, new_h = scale * w // base * base, scale * h // base * base + else: + new_w, new_h = 1.0 * w // base * base, 1.0 * h // base * base + + scale_w = 1.0 * new_w / w + scale_h = 1.0 * new_h / h + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + img = cv2.resize(img, (int(new_w), int(new_h))) + + return img, intrinsics + + def read_mask_hr(self, filename): + img = Image.open(filename) + np_img = np.array(img, dtype=np.float32) + np_img = (np_img > 10).astype(np.float32) + np_img = self.prepare_img(np_img) + + h, w = np_img.shape + np_img_ms = { + 'stage1': + cv2.resize( + np_img, (w // 8, h // 8), interpolation=cv2.INTER_NEAREST), + 'stage2': + cv2.resize( + np_img, (w // 4, h // 4), interpolation=cv2.INTER_NEAREST), + 'stage3': + cv2.resize( + np_img, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST), + 'stage4': + np_img, + } + return np_img_ms + + def read_depth_hr(self, filename, scale): + depth_hr = np.array(read_pfm(filename)[0], dtype=np.float32) * scale + depth_lr = self.prepare_img(depth_hr) + + h, w = depth_lr.shape + depth_lr_ms = { + 'stage1': + cv2.resize( + depth_lr, (w // 8, h // 8), interpolation=cv2.INTER_NEAREST), + 'stage2': + cv2.resize( + depth_lr, (w // 4, h // 4), interpolation=cv2.INTER_NEAREST), + 'stage3': + cv2.resize( + depth_lr, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST), + 'stage4': + depth_lr, + } + return depth_lr_ms + + def __getitem__(self, idx): + global S_H, S_W + meta = self.metas[idx] + scan, ref_view, src_views, scene_name = meta + # use only the reference view and first nviews-1 source views + view_ids = [ref_view] + src_views[:self.n_views - 1] + + scale_ratio = 1 + + imgs = [] + depth_values = None + proj_matrices = [] + + for i, vid in enumerate(view_ids): + img_filename = os.path.join( + self.root_dir, '{}/images_post/{:0>8}.jpg'.format(scan, vid)) + if not os.path.exists(img_filename): + img_filename = os.path.join( + self.root_dir, '{}/images/{:0>8}.jpg'.format(scan, vid)) + + proj_mat_filename = os.path.join( + self.root_dir, '{}/cams/{:0>8}_cam.txt'.format(scan, vid)) + + img = self.read_img(img_filename) + intrinsics, extrinsics, depth_min, depth_interval = self.read_cam_file( + proj_mat_filename, + interval_scale=self.interval_scale[scene_name]) + # scale input + img, intrinsics = self.scale_mvs_input(img, intrinsics, self.max_w, + self.max_h) + + if self.fix_res: + # using the same standard height or width in entire scene. 
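+                # (the first view that reaches this branch freezes S_H/S_W for every later sample, since fix_res is cleared and fix_wh is set right below)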
+ S_H, S_W = img.shape[:2] + self.fix_res = False + self.fix_wh = True + + if i == 0: + if not self.fix_wh: + # using the same standard height or width in each nviews. + S_H, S_W = img.shape[:2] + + # resize to standard height or width + c_h, c_w = img.shape[:2] + if (c_h != S_H) or (c_w != S_W): + scale_h = 1.0 * S_H / c_h + scale_w = 1.0 * S_W / c_w + img = cv2.resize(img, (S_W, S_H)) + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + ################# + imgs.append(img.transpose(2, 0, 1)) + + # reference view + if i == 0: + # @Note depth values + diff = 0.5 if self.mode in ['test', 'val'] else 0 + depth_max = depth_interval * (self.total_depths + - diff) + depth_min + depth_values = np.array( + [depth_min * scale_ratio, depth_max * scale_ratio], + dtype=np.float32) + + proj_mat = np.zeros(shape=(2, 4, 4), dtype=np.float32) + proj_mat[0, :4, :4] = extrinsics + proj_mat[1, :3, :3] = intrinsics + proj_matrices.append(proj_mat) + + proj_matrices = np.stack(proj_matrices) + intrinsics = np.stack(intrinsics) + stage1_pjmats = proj_matrices.copy() + stage1_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] / 2.0 + stage1_ins = intrinsics.copy() + stage1_ins[:2, :] = intrinsics[:2, :] / 2.0 + stage3_pjmats = proj_matrices.copy() + stage3_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 2 + stage3_ins = intrinsics.copy() + stage3_ins[:2, :] = intrinsics[:2, :] * 2.0 + stage4_pjmats = proj_matrices.copy() + stage4_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 4 + stage4_ins = intrinsics.copy() + stage4_ins[:2, :] = intrinsics[:2, :] * 4.0 + proj_matrices = { + 'stage1': stage1_pjmats, + 'stage2': proj_matrices, + 'stage3': stage3_pjmats, + 'stage4': stage4_pjmats + } + intrinsics_matrices = { + 'stage1': stage1_ins, + 'stage2': intrinsics, + 'stage3': stage3_ins, + 'stage4': stage4_ins + } + + sample = { + 'imgs': imgs, + 'proj_matrices': proj_matrices, + 'intrinsics_matrices': intrinsics_matrices, + 'depth_values': depth_values, + 'filename': scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + '{}' + } + return sample diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py new file mode 100644 index 000000000..0777945af --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py @@ -0,0 +1,196 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import os.path as osp +import time + +import cv2 +import numpy as np +import torch +from easydict import EasyDict as edict +from torch.utils.data import DataLoader + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .colmap2mvsnet import processing_single_scene +from .depth_filter import pcd_depth_filter +from .general_eval_dataset import MVSDataset, save_pfm +from .models.geomvsnet import GeoMVSNet +from .models.utils import * +from .models.utils.opts import get_opts +from .utils import (generate_pointcloud, numpy2torch, tensor2numpy, tocuda, + write_cam) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.image_multi_view_depth_estimation, + module_name=Models.image_geomvsnet_depth_estimation) +class GeoMVSNetDepthEstimation(TorchModel): + ''' + GeoMVSNet is a state-of-the-art MVS(multi-view stereo) depth estimation method. 
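+    Given multi-view images with COLMAP poses, ``preprocess_make_pair`` builds the MVSNet-style inputs,
+    ``forward`` predicts per-view depth and confidence maps, and ``postprocess`` fuses them into a point cloud.
+    A minimal usage sketch (directory names are placeholders, not part of the API):
+        model = GeoMVSNetDepthEstimation(model_dir)
+        inputs = {'input_dir': dense_dir,           # COLMAP dense folder containing images/ and sparse/
+                  'casmvs_inp_dir': mvs_scene_dir,  # converted MVSNet inputs for one scene
+                  'casmvs_res_dir': mvs_out_dir}    # where per-scene depth_est/ and confidence/ maps are written
+        model.preprocess_make_pair(inputs)
+        model.forward(inputs)
+        pcd = model.postprocess(inputs)  # plyfile.PlyData point cloud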
+ For more details, please refer to https://github.com/doublez0108/geomvsnet + ''' + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + self.n_views = 5 + self.levels = 4 + self.hypo_plane_num_stages = '8,8,4,4' + self.depth_interal_ratio_stages = '0.5,0.5,0.5,1' + self.feat_base_channel = 8 + self.reg_base_channel = 8 + self.group_cor_dim_stages = '8,8,4,4' + self.batch_size = 1 + + self.model = GeoMVSNet( + levels=self.levels, + hypo_plane_num_stages=[ + int(n) for n in self.hypo_plane_num_stages.split(',') + ], + depth_interal_ratio_stages=[ + float(ir) for ir in self.depth_interal_ratio_stages.split(',') + ], + feat_base_channel=self.feat_base_channel, + reg_base_channel=self.reg_base_channel, + group_cor_dim_stages=[ + int(n) for n in self.group_cor_dim_stages.split(',') + ], + ) + + # load checkpoint file + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model {ckpt_path}') + state_dict = torch.load(ckpt_path, map_location=torch.device('cpu')) + self.model.load_state_dict(state_dict['model'], strict=False) + + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + + self.model.to(self.device) + self.model.eval() + logger.info(f'model init done! Device:{self.device}') + + def preprocess_make_pair(self, inputs): + + data = inputs['input_dir'] + casmvs_inp_dir = inputs['casmvs_inp_dir'] + + args = edict() + args.dense_folder = data + args.save_folder = casmvs_inp_dir + args.max_d = 192 + args.interval_scale = 1.06 + args.theta0 = 5 + args.sigma1 = 1 + args.sigma2 = 10 + args.model_ext = '.bin' + + logger.info('preprocess of making pair data start, folder: %s', + args.dense_folder) + processing_single_scene(args) + logger.info('preprocess of making pair data done') + + def forward(self, inputs): + + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + test_list = [scene] + save_dir = inputs['casmvs_res_dir'] + + logger.info('depth estimation start') + + test_dataset = MVSDataset( + test_dir, test_list, 'test', self.n_views, max_wh=(1600, 1200)) + TestImgLoader = DataLoader( + test_dataset, + self.batch_size, + shuffle=False, + num_workers=4, + drop_last=False) + + total_time = 0 + with torch.no_grad(): + for batch_idx, sample in enumerate(TestImgLoader): + sample_cuda = tocuda(sample) + + # @Note GeoMVSNet main + start_time = time.time() + outputs = self.model(sample_cuda['imgs'], + sample_cuda['proj_matrices'], + sample_cuda['intrinsics_matrices'], + sample_cuda['depth_values'], + sample['filename']) + end_time = time.time() + total_time += end_time - start_time + + outputs = tensor2numpy(outputs) + del sample_cuda + filenames = sample['filename'] + cams = sample['proj_matrices']['stage{}'.format( + self.levels)].numpy() + imgs = sample['imgs'] + logger.info('Iter {}/{}, Time:{:.3f} Res:{}'.format( + batch_idx, len(TestImgLoader), end_time - start_time, + imgs[0].shape)) + + for filename, cam, img, depth_est, photometric_confidence in zip( + filenames, cams, imgs, outputs['depth'], + outputs['photometric_confidence']): + img = img[0].numpy() # ref view + cam = cam[0] # ref cam + + depth_filename = os.path.join( + save_dir, filename.format('depth_est', '.pfm')) + confidence_filename = os.path.join( + save_dir, filename.format('confidence', '.pfm')) + cam_filename = os.path.join( + save_dir, filename.format('cams', '_cam.txt')) + img_filename = os.path.join( + save_dir, filename.format('images', 
'.jpg')) + os.makedirs( + depth_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs( + confidence_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(cam_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(img_filename.rsplit('/', 1)[0], exist_ok=True) + + # save depth maps + save_pfm(depth_filename, depth_est) + + # save confidence maps + confidence_list = [ + outputs['stage{}'.format(i)] + ['photometric_confidence'].squeeze(0) + for i in range(1, self.levels + 1) + ] + print('confidence_list', len(confidence_list)) + photometric_confidence = confidence_list[-1] + save_pfm(confidence_filename, photometric_confidence) + + # save camera info + write_cam(cam_filename, cam) + img = np.clip(np.transpose(img, (1, 2, 0)) * 255, 0, + 255).astype(np.uint8) + img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + cv2.imwrite(img_filename, img_bgr) + + torch.cuda.empty_cache() + logger.info('depth estimation end') + return inputs + + def postprocess(self, inputs): + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + logger.info('depth fusion start') + pcd = pcd_depth_filter( + scene, test_dir, inputs['casmvs_res_dir'], thres_view=4) + logger.info('depth fusion end') + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py new file mode 100644 index 000000000..4f29d642e --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py @@ -0,0 +1,2 @@ +from .geomvsnet import GeoMVSNet +from .loss import geomvsnet_loss diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py new file mode 100644 index 000000000..9482ebace --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py @@ -0,0 +1,38 @@ +# @Description: Basic implementation of Frequency Domain Filtering strategy (Sec 3.2 in the paper). +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import matplotlib.pyplot as plt +import numpy as np +import torch + + +def frequency_domain_filter(depth, rho_ratio): + """ + large rho_ratio -> more information filtered + """ + f = torch.fft.fft2(depth) + fshift = torch.fft.fftshift(f) + + b, h, w = depth.shape + k_h, k_w = h / rho_ratio, w / rho_ratio + + fshift[:, :int(h / 2 - k_h / 2), :] = 0 + fshift[:, int(h / 2 + k_h / 2):, :] = 0 + fshift[:, :, :int(w / 2 - k_w / 2)] = 0 + fshift[:, :, int(w / 2 + k_w / 2):] = 0 + + ishift = torch.fft.ifftshift(fshift) + idepth = torch.fft.ifft2(ishift) + depth_filtered = torch.abs(idepth) + + return depth_filtered + + +def visual_fft_fig(fshift): + fft_fig = torch.abs(20 * torch.log(fshift)) + plt.figure(figsize=(10, 10)) + plt.subplot(121) + plt.imshow(fft_fig[0, :, :], cmap='gray') diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py new file mode 100644 index 000000000..f108b05cc --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py @@ -0,0 +1,856 @@ +# @Description: Geometric Prior Guided Feature Fusion & Probability Volume Geometry Embedding (Sec 3.1 in the paper). 
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .submodules import ConvBnReLU3D + + +class GeoFeatureFusion(nn.Module): + + def __init__(self, + convolutional_layer_encoding='z', + mask_type='basic', + add_origin_feat_flag=True): + super(GeoFeatureFusion, self).__init__() + + self.convolutional_layer_encoding = convolutional_layer_encoding # std / uv / z / xyz + self.mask_type = mask_type # basic / mean + self.add_origin_feat_flag = add_origin_feat_flag # True / False + + if self.convolutional_layer_encoding == 'std': + self.geoplanes = 0 + elif self.convolutional_layer_encoding == 'uv': + self.geoplanes = 2 + elif self.convolutional_layer_encoding == 'z': + self.geoplanes = 1 + elif self.convolutional_layer_encoding == 'xyz': + self.geoplanes = 3 + self.geofeature = GeometryFeature() + + # rgb encoder + self.rgb_conv_init = convbnrelu( + in_channels=4, out_channels=8, kernel_size=5, stride=1, padding=2) + + self.rgb_encoder_layer1 = BasicBlockGeo( + inplanes=8, planes=16, stride=2, geoplanes=self.geoplanes) + self.rgb_encoder_layer2 = BasicBlockGeo( + inplanes=16, planes=32, stride=1, geoplanes=self.geoplanes) + self.rgb_encoder_layer3 = BasicBlockGeo( + inplanes=32, planes=64, stride=2, geoplanes=self.geoplanes) + self.rgb_encoder_layer4 = BasicBlockGeo( + inplanes=64, planes=128, stride=1, geoplanes=self.geoplanes) + self.rgb_encoder_layer5 = BasicBlockGeo( + inplanes=128, planes=256, stride=2, geoplanes=self.geoplanes) + + self.rgb_decoder_layer4 = deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_layer2 = deconvbnrelu( + in_channels=128, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_layer0 = deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.rgb_decoder_layer = deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_output = deconvbnrelu( + in_channels=8, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + # depth encoder + self.depth_conv_init = convbnrelu( + in_channels=2, out_channels=8, kernel_size=5, stride=1, padding=2) + + self.depth_layer1 = BasicBlockGeo( + inplanes=8, planes=16, stride=2, geoplanes=self.geoplanes) + self.depth_layer2 = BasicBlockGeo( + inplanes=16, planes=32, stride=1, geoplanes=self.geoplanes) + self.depth_layer3 = BasicBlockGeo( + inplanes=64, planes=64, stride=2, geoplanes=self.geoplanes) + self.depth_layer4 = BasicBlockGeo( + inplanes=64, planes=128, stride=1, geoplanes=self.geoplanes) + self.depth_layer5 = BasicBlockGeo( + inplanes=256, planes=256, stride=2, geoplanes=self.geoplanes) + + self.decoder_layer3 = deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.decoder_layer4 = deconvbnrelu( + in_channels=128, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.decoder_layer5 = deconvbnrelu( + in_channels=64, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.decoder_layer6 = deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + 
output_padding=0) + self.decoder_layer7 = deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + + # output + self.rgbdepth_decoder_stage1 = deconvbnrelu( + in_channels=32, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgbdepth_decoder_stage2 = deconvbnrelu( + in_channels=16, + out_channels=16, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgbdepth_decoder_stage3 = deconvbnrelu( + in_channels=8, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + self.final_decoder_stage1 = deconvbnrelu( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.final_decoder_stage2 = deconvbnrelu( + in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.final_decoder_stage3 = deconvbnrelu( + in_channels=8, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + self.softmax = nn.Softmax(dim=1) + self.pooling = nn.AvgPool2d(kernel_size=2) + self.sparsepooling = SparseDownSampleClose(stride=2) + + weights_init(self) + + def forward(self, rgb, depth, confidence, depth_values, stage_idx, + origin_feat, intrinsics_matrices_stage): + + rgb = rgb + depth_min, depth_max = depth_values[:, 0, None, None, + None], depth_values[:, -1, None, + None, None] + d = (depth - depth_min) / (depth_max - depth_min) + + if self.mask_type == 'basic': + valid_mask = torch.where(d > 0, torch.full_like(d, 1.0), + torch.full_like(d, 0.0)) + elif self.mask_type == 'mean': + valid_mask = torch.where( + torch.logical_and(d > 0, confidence > confidence.mean()), + torch.full_like(d, 1.0), torch.full_like(d, 0.0)) + + # pre-data preparation + if self.convolutional_layer_encoding in ['uv', 'xyz']: + B, _, W, H = rgb.shape + position = AddCoordsNp(H, W) + position = position.call() + position = torch.from_numpy(position).to(rgb.device).repeat( + B, 1, 1, 1).transpose(-1, 1) + unorm = position[:, 0:1, :, :] + vnorm = position[:, 1:2, :, :] + + vnorm_s2 = self.pooling(vnorm) + vnorm_s3 = self.pooling(vnorm_s2) + vnorm_s4 = self.pooling(vnorm_s3) + + unorm_s2 = self.pooling(unorm) + unorm_s3 = self.pooling(unorm_s2) + unorm_s4 = self.pooling(unorm_s3) + + if self.convolutional_layer_encoding in ['z', 'xyz']: + d_s2, vm_s2 = self.sparsepooling(d, valid_mask) + d_s3, vm_s3 = self.sparsepooling(d_s2, vm_s2) + d_s4, vm_s4 = self.sparsepooling(d_s3, vm_s3) + + if self.convolutional_layer_encoding == 'xyz': + K = intrinsics_matrices_stage + f352 = K[:, 1, 1] + f352 = f352.unsqueeze(1) + f352 = f352.unsqueeze(2) + f352 = f352.unsqueeze(3) + c352 = K[:, 1, 2] + c352 = c352.unsqueeze(1) + c352 = c352.unsqueeze(2) + c352 = c352.unsqueeze(3) + f1216 = K[:, 0, 0] + f1216 = f1216.unsqueeze(1) + f1216 = f1216.unsqueeze(2) + f1216 = f1216.unsqueeze(3) + c1216 = K[:, 0, 2] + c1216 = c1216.unsqueeze(1) + c1216 = c1216.unsqueeze(2) + c1216 = c1216.unsqueeze(3) + + # geometric info + if self.convolutional_layer_encoding == 'std': + geo_s1 = None + geo_s2 = None + geo_s3 = None + geo_s4 = None + elif self.convolutional_layer_encoding == 'uv': + geo_s1 = torch.cat((vnorm, unorm), dim=1) + geo_s2 = torch.cat((vnorm_s2, unorm_s2), dim=1) + geo_s3 = torch.cat((vnorm_s3, unorm_s3), dim=1) + geo_s4 = torch.cat((vnorm_s4, unorm_s4), dim=1) + elif self.convolutional_layer_encoding == 'z': + geo_s1 = d + geo_s2 = d_s2 + geo_s3 = d_s3 + geo_s4 = d_s4 + elif self.convolutional_layer_encoding == 
'xyz': + geo_s1 = self.geofeature(d, vnorm, unorm, H, W, c352, c1216, f352, + f1216) + geo_s2 = self.geofeature(d_s2, vnorm_s2, unorm_s2, H / 2, W / 2, + c352, c1216, f352, f1216) + geo_s3 = self.geofeature(d_s3, vnorm_s3, unorm_s3, H / 4, W / 4, + c352, c1216, f352, f1216) + geo_s4 = self.geofeature(d_s4, vnorm_s4, unorm_s4, H / 8, W / 8, + c352, c1216, f352, f1216) + + # ----------------------------------------------------------------------------------------- + + # 128*160 -> 256*320 -> 512*640 + rgb_feature = self.rgb_conv_init(torch.cat((rgb, d), dim=1)) # b 8 h w + rgb_feature1 = self.rgb_encoder_layer1(rgb_feature, geo_s1, + geo_s2) # b 16 h/2 w/2 + rgb_feature2 = self.rgb_encoder_layer2(rgb_feature1, geo_s2, + geo_s2) # b 32 h/2 w/2 + rgb_feature3 = self.rgb_encoder_layer3(rgb_feature2, geo_s2, + geo_s3) # b 64 h/4 w/4 + rgb_feature4 = self.rgb_encoder_layer4(rgb_feature3, geo_s3, + geo_s3) # b 128 h/4 w/4 + rgb_feature5 = self.rgb_encoder_layer5(rgb_feature4, geo_s3, + geo_s4) # b 256 h/8 w/8 + + rgb_feature_decoder4 = self.rgb_decoder_layer4(rgb_feature5) + rgb_feature4_plus = rgb_feature_decoder4 + rgb_feature4 # b 128 h/4 w/4 + + rgb_feature_decoder2 = self.rgb_decoder_layer2(rgb_feature4_plus) + rgb_feature2_plus = rgb_feature_decoder2 + rgb_feature2 # b 32 h/2 w/2 + + rgb_feature_decoder0 = self.rgb_decoder_layer0(rgb_feature2_plus) + rgb_feature0_plus = rgb_feature_decoder0 + rgb_feature1 # b 16 h/2 w/2 + + rgb_feature_decoder = self.rgb_decoder_layer(rgb_feature0_plus) + rgb_feature_plus = rgb_feature_decoder + rgb_feature # b 8 h w + + rgb_output = self.rgb_decoder_output(rgb_feature_plus) # b 2 h w + + rgb_depth = rgb_output[:, 0:1, :, :] + # rgb_conf = rgb_output[:, 1:2, :, :] + + # ----------------------------------------------------------------------------------------- + + sparsed_feature = self.depth_conv_init( + torch.cat((d, rgb_depth), dim=1)) # b 8 h w + sparsed_feature1 = self.depth_layer1(sparsed_feature, geo_s1, + geo_s2) # b 16 h/2 w/2 + sparsed_feature2 = self.depth_layer2(sparsed_feature1, geo_s2, + geo_s2) # b 32 h/2 w/2 + + sparsed_feature2_plus = torch.cat( + [rgb_feature2_plus, sparsed_feature2], 1) + sparsed_feature3 = self.depth_layer3(sparsed_feature2_plus, geo_s2, + geo_s3) # b 64 h/4 w/4 + sparsed_feature4 = self.depth_layer4(sparsed_feature3, geo_s3, + geo_s3) # b 128 h/4 w/4 + + sparsed_feature4_plus = torch.cat( + [rgb_feature4_plus, sparsed_feature4], 1) + sparsed_feature5 = self.depth_layer5(sparsed_feature4_plus, geo_s3, + geo_s4) # b 256 h/8 w/8 + + # ----------------------------------------------------------------------------------------- + + fusion3 = rgb_feature5 + sparsed_feature5 + decoder_feature3 = self.decoder_layer3(fusion3) # b 128 h/4 w/4 + + fusion4 = sparsed_feature4 + decoder_feature3 + decoder_feature4 = self.decoder_layer4(fusion4) # b 64 h/4 w/4 + + if stage_idx >= 1: + decoder_feature5 = self.decoder_layer5(decoder_feature4) + fusion5 = sparsed_feature2 + decoder_feature5 # b 32 h/2 w/2 + if stage_idx == 1: + rgbdepth_feature = self.rgbdepth_decoder_stage1(fusion5) + if self.add_origin_feat_flag: + final_feature = self.final_decoder_stage1(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage1(rgbdepth_feature) + + if stage_idx >= 2: + decoder_feature6 = self.decoder_layer6(decoder_feature5) + fusion6 = sparsed_feature1 + decoder_feature6 # b 16 h/2 w/2 + if stage_idx == 2: + rgbdepth_feature = self.rgbdepth_decoder_stage2(fusion6) + if self.add_origin_feat_flag: + final_feature = 
self.final_decoder_stage2(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage2(rgbdepth_feature) + + if stage_idx >= 3: + decoder_feature7 = self.decoder_layer7(decoder_feature6) + fusion7 = sparsed_feature + decoder_feature7 # b 8 h w + if stage_idx == 3: + rgbdepth_feature = self.rgbdepth_decoder_stage3(fusion7) + if self.add_origin_feat_flag: + final_feature = self.final_decoder_stage3(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage3(rgbdepth_feature) + + return final_feature + + +class GeoRegNet2d(nn.Module): + + def __init__(self, + input_channel=128, + base_channel=32, + convolutional_layer_encoding='std'): + super(GeoRegNet2d, self).__init__() + + self.convolutional_layer_encoding = convolutional_layer_encoding # std / uv / z / xyz + self.mask_type = 'basic' # basic / mean + + if self.convolutional_layer_encoding == 'std': + self.geoplanes = 0 + elif self.convolutional_layer_encoding == 'z': + self.geoplanes = 1 + + self.conv_init = ConvBnReLU3D( + input_channel, + out_channels=8, + kernel_size=(1, 3, 3), + pad=(0, 1, 1)) + self.encoder_layer1 = Reg_BasicBlockGeo( + inplanes=8, + planes=16, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer2 = Reg_BasicBlockGeo( + inplanes=16, + planes=32, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer3 = Reg_BasicBlockGeo( + inplanes=32, + planes=64, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer4 = Reg_BasicBlockGeo( + inplanes=64, + planes=128, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer5 = Reg_BasicBlockGeo( + inplanes=128, + planes=256, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + + self.decoder_layer4 = reg_deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + self.decoder_layer3 = reg_deconvbnrelu( + in_channels=128, + out_channels=64, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + self.decoder_layer2 = reg_deconvbnrelu( + in_channels=64, + out_channels=32, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + self.decoder_layer1 = reg_deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + self.decoder_layer = reg_deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + + self.prob = reg_deconvbnrelu( + in_channels=8, + out_channels=1, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + + self.depthpooling = nn.MaxPool3d((2, 1, 1), (2, 1, 1)) + self.basicpooling = nn.MaxPool3d((1, 2, 2), (1, 2, 2)) + + weights_init(self) + + def forward(self, x, stage_idx, geo_reg_data=None): + + B, C, D, W, H = x.shape + + if stage_idx >= 1 and self.convolutional_layer_encoding == 'z': + prob_volume = geo_reg_data['prob_volume_last'].unsqueeze( + 1) # B 1 D H W + else: + assert self.convolutional_layer_encoding == 'std' + + # geometric info + if self.convolutional_layer_encoding == 'std': + geo_s1 = None + geo_s2 = None + geo_s3 = None + # geo_s4 = None + elif self.convolutional_layer_encoding == 'z': + if stage_idx == 2: + geo_s1 = 
self.depthpooling(prob_volume) + else: + geo_s1 = prob_volume # B 1 D H W + geo_s2 = self.basicpooling(geo_s1) + geo_s3 = self.basicpooling(geo_s2) + + feature = self.conv_init(x) # B 8 D H W + feature1 = self.encoder_layer1(feature, geo_s1, + geo_s1) # B 16 D H/2 W/2 + feature2 = self.encoder_layer2(feature1, geo_s2, + geo_s2) # B 32 D H/2 W/2 + feature3 = self.encoder_layer3(feature2, geo_s2, + geo_s2) # B 64 D H/4 W/4 + feature4 = self.encoder_layer4(feature3, geo_s3, + geo_s3) # B 128 D H/4 W/4 + feature5 = self.encoder_layer5(feature4, geo_s3, + geo_s3) # B 256 D H/8 W/8 + + feature_decoder4 = self.decoder_layer4(feature5) + feature4_plus = feature_decoder4 + feature4 # B 128 D H/4 W/4 + + feature_decoder3 = self.decoder_layer3(feature4_plus) + feature3_plus = feature_decoder3 + feature3 # B 64 D H/4 W/4 + + feature_decoder2 = self.decoder_layer2(feature3_plus) + feature2_plus = feature_decoder2 + feature2 # B 32 D H/2 W/2 + + feature_decoder1 = self.decoder_layer1(feature2_plus) + feature1_plus = feature_decoder1 + feature1 # B 16 D H/2 W/2 + + feature_decoder = self.decoder_layer(feature1_plus) + feature_plus = feature_decoder + feature # B 8 D H W + + x = self.prob(feature_plus) + + return x.squeeze(1) + + +# -------------------------------------------------------------- + + +class BasicBlockGeo(nn.Module): + expansion = 1 + __constants__ = ['downsample'] + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None, + geoplanes=3): + super(BasicBlockGeo, self).__init__() + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + + self.conv1 = conv3x3(inplanes + geoplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes + geoplanes, planes) + self.bn2 = norm_layer(planes) + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + conv1x1(inplanes + geoplanes, planes, stride), + norm_layer(planes), + ) + self.downsample = downsample + self.stride = stride + + def forward(self, x, g1=None, g2=None): + identity = x + if g1 is not None: + x = torch.cat((x, g1), 1) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + if g2 is not None: + out = torch.cat((g2, out), 1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class GeometryFeature(nn.Module): + + def __init__(self): + super(GeometryFeature, self).__init__() + + def forward(self, z, vnorm, unorm, h, w, ch, cw, fh, fw): + x = z * (0.5 * h * (vnorm + 1) - ch) / fh + y = z * (0.5 * w * (unorm + 1) - cw) / fw + return torch.cat((x, y, z), 1) + + +class SparseDownSampleClose(nn.Module): + + def __init__(self, stride): + super(SparseDownSampleClose, self).__init__() + self.pooling = nn.MaxPool2d(stride, stride) + self.large_number = 600 + + def forward(self, d, mask): + encode_d = -(1 - mask) * self.large_number - d + + d = -self.pooling(encode_d) + mask_result = self.pooling(mask) + d_result = d - (1 - mask_result) * self.large_number + + return d_result, mask_result + + +def convbnrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1): + return nn.Sequential( + nn.Conv2d( + in_channels, + out_channels, + 
kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False), nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) + + +def deconvbnrelu(in_channels, + out_channels, + kernel_size=5, + stride=2, + padding=2, + output_padding=1): + return nn.Sequential( + nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) + + +def weights_init(m): + """Initialize filters with Gaussian random weights""" + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.ConvTranspose2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def conv3x3(in_planes, + out_planes, + stride=1, + groups=1, + dilation=1, + bias=False, + padding=1): + """3x3 convolution with padding""" + if padding >= 1: + padding = dilation + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1, groups=1, bias=False): + """1x1 convolution""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + groups=groups, + bias=bias) + + +class AddCoordsNp(): + """Add coords to a tensor""" + + def __init__(self, x_dim=64, y_dim=64, with_r=False): + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + + def call(self): + """ + input_tensor: (batch, x_dim, y_dim, c) + """ + xx_ones = np.ones([self.x_dim], dtype=np.int32) + xx_ones = np.expand_dims(xx_ones, 1) + + xx_range = np.expand_dims(np.arange(self.y_dim), 0) + + xx_channel = np.matmul(xx_ones, xx_range) + xx_channel = np.expand_dims(xx_channel, -1) + + yy_ones = np.ones([self.y_dim], dtype=np.int32) + yy_ones = np.expand_dims(yy_ones, 0) + + yy_range = np.expand_dims(np.arange(self.x_dim), 1) + + yy_channel = np.matmul(yy_range, yy_ones) + yy_channel = np.expand_dims(yy_channel, -1) + + xx_channel = xx_channel.astype('float32') / (self.y_dim - 1) + yy_channel = yy_channel.astype('float32') / (self.x_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + ret = np.concatenate([xx_channel, yy_channel], axis=-1) + + if self.with_r: + rr = np.sqrt( + np.square(xx_channel - 0.5) + np.square(yy_channel - 0.5)) + ret = np.concatenate([ret, rr], axis=-1) + + return ret + + +# -------------------------------------------------------------- + + +class Reg_BasicBlockGeo(nn.Module): + + def __init__(self, + inplanes, + planes, + kernel_size, + stride, + padding, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=nn.BatchNorm3d, + geoplanes=3): + super(Reg_BasicBlockGeo, self).__init__() + + self.conv1 = regconv3D( + inplanes + geoplanes, + planes, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1)) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = regconv3D(planes + geoplanes, planes, kernel_size, stride, + padding) + self.bn2 = norm_layer(planes) + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + regconv1x1(inplanes + geoplanes, planes, kernel_size, stride, + padding), + norm_layer(planes), + ) + self.downsample = 
downsample + self.stride = stride + + def forward(self, x, g1=None, g2=None): + identity = x + if g1 is not None: + x = torch.cat((x, g1), 1) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + if g2 is not None: + out = torch.cat((g2, out), 1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +def regconv3D(in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=1, + dilation=1, + bias=False): + return nn.Conv3d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + dilation=dilation) + + +def regconv1x1(in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=1, + bias=False): + return nn.Conv3d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias) + + +def reg_deconvbnrelu(in_channels, out_channels, kernel_size, stride, padding, + output_padding): + return nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True)) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py new file mode 100644 index 000000000..965401d75 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py @@ -0,0 +1,267 @@ +# @Description: Main network architecture for GeoMVSNet. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .filter import frequency_domain_filter +from .geometry import GeoFeatureFusion, GeoRegNet2d +from .submodules import (FPN, Reg2d, homo_warping, init_inverse_range, + schedule_inverse_range) + + +class GeoMVSNet(nn.Module): + + def __init__(self, levels, hypo_plane_num_stages, + depth_interal_ratio_stages, feat_base_channel, + reg_base_channel, group_cor_dim_stages): + super(GeoMVSNet, self).__init__() + + self.levels = levels + self.hypo_plane_num_stages = hypo_plane_num_stages + self.depth_interal_ratio_stages = depth_interal_ratio_stages + + self.StageNet = StageNet() + + # feature settings + self.FeatureNet = FPN(base_channels=feat_base_channel) + self.coarest_separate_flag = True + if self.coarest_separate_flag: + self.CoarestFeatureNet = FPN(base_channels=feat_base_channel) + self.GeoFeatureFusionNet = GeoFeatureFusion( + convolutional_layer_encoding='z', + mask_type='basic', + add_origin_feat_flag=True) + + # cost regularization settings + self.RegNet_stages = nn.ModuleList() + self.group_cor_dim_stages = group_cor_dim_stages + self.geo_reg_flag = True + self.geo_reg_encodings = ['std', 'z', 'z', + 'z'] # must use std in idx-0 + for stage_idx in range(self.levels): + in_dim = group_cor_dim_stages[stage_idx] + if self.geo_reg_flag: + self.RegNet_stages.append( + GeoRegNet2d( + input_channel=in_dim, + base_channel=reg_base_channel, + convolutional_layer_encoding=self. 
+ geo_reg_encodings[stage_idx])) + else: + self.RegNet_stages.append( + Reg2d(input_channel=in_dim, base_channel=reg_base_channel)) + + # frequency domain filter settings + self.curriculum_learning_rho_ratios = [9, 4, 2, 1] + + def forward(self, + imgs, + proj_matrices, + intrinsics_matrices, + depth_values, + filename=None): + + features = [] + if self.coarest_separate_flag: + coarsest_features = [] + for nview_idx in range(len(imgs)): + img = imgs[nview_idx] + features.append(self.FeatureNet(img)) # B C H W + if self.coarest_separate_flag: + coarsest_features.append(self.CoarestFeatureNet(img)) + + # coarse-to-fine + outputs = {} + for stage_idx in range(self.levels): + stage_name = 'stage{}'.format(stage_idx + 1) + B, C, H, W = features[0][stage_name].shape + proj_matrices_stage = proj_matrices[stage_name] + intrinsics_matrices_stage = intrinsics_matrices[stage_name] + + # @Note features + if stage_idx == 0: + if self.coarest_separate_flag: + features_stage = [ + feat[stage_name] for feat in coarsest_features + ] + else: + features_stage = [feat[stage_name] for feat in features] + elif stage_idx >= 1: + features_stage = [feat[stage_name] for feat in features] + + ref_img_stage = F.interpolate( + imgs[0], + size=None, + scale_factor=1. / 2**(3 - stage_idx), + mode='bilinear', + align_corners=False) + depth_last = F.interpolate( + depth_last.unsqueeze(1), + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + confidence_last = F.interpolate( + confidence_last.unsqueeze(1), + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + + # reference feature + features_stage[0] = self.GeoFeatureFusionNet( + ref_img_stage, depth_last, confidence_last, depth_values, + stage_idx, features_stage[0], intrinsics_matrices_stage) + + # @Note depth hypos + if stage_idx == 0: + depth_hypo = init_inverse_range( + depth_values, self.hypo_plane_num_stages[stage_idx], + img[0].device, img[0].dtype, H, W) + else: + inverse_min_depth, inverse_max_depth = outputs_stage[ + 'inverse_min_depth'].detach(), \ + outputs_stage['inverse_max_depth'].detach() + depth_hypo = schedule_inverse_range( + inverse_min_depth, inverse_max_depth, + self.hypo_plane_num_stages[stage_idx], H, W) # B D H W + + # @Note cost regularization + geo_reg_data = {} + if self.geo_reg_flag: + geo_reg_data['depth_values'] = depth_values + if stage_idx >= 1 and self.geo_reg_encodings[stage_idx] == 'z': + prob_volume_last = F.interpolate( + prob_volume_last, + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + geo_reg_data['prob_volume_last'] = prob_volume_last + + outputs_stage = self.StageNet( + stage_idx, + features_stage, + proj_matrices_stage, + depth_hypo=depth_hypo, + regnet=self.RegNet_stages[stage_idx], + group_cor_dim=self.group_cor_dim_stages[stage_idx], + depth_interal_ratio=self.depth_interal_ratio_stages[stage_idx], + geo_reg_data=geo_reg_data) + + # @Note frequency domain filter + depth_est = outputs_stage['depth'] + depth_est_filtered = frequency_domain_filter( + depth_est, + rho_ratio=self.curriculum_learning_rho_ratios[stage_idx]) + outputs_stage['depth_filtered'] = depth_est_filtered + depth_last = depth_est_filtered + + confidence_last = outputs_stage['photometric_confidence'] + prob_volume_last = outputs_stage['prob_volume'] + + outputs[stage_name] = outputs_stage + outputs.update(outputs_stage) + + return outputs + + +class StageNet(nn.Module): + + def __init__(self, attn_temp=2): + super(StageNet, self).__init__() + self.attn_temp = attn_temp + + def forward(self, + 
stage_idx, + features, + proj_matrices, + depth_hypo, + regnet, + group_cor_dim, + depth_interal_ratio, + geo_reg_data=None): + + # @Note step1: feature extraction + proj_matrices = torch.unbind(proj_matrices, 1) + ref_feature, src_features = features[0], features[1:] + ref_proj, src_projs = proj_matrices[0], proj_matrices[1:] + B, D, H, W = depth_hypo.shape + C = ref_feature.shape[1] + + # @Note step2: cost aggregation + ref_volume = ref_feature.unsqueeze(2).repeat(1, 1, D, 1, 1) + cor_weight_sum = 1e-8 + cor_feats = 0 + for src_idx, (src_fea, + src_proj) in enumerate(zip(src_features, src_projs)): + # save_fn = None + src_proj_new = src_proj[:, 0].clone() + src_proj_new[:, :3, :4] = torch.matmul(src_proj[:, 1, :3, :3], + src_proj[:, 0, :3, :4]) + ref_proj_new = ref_proj[:, 0].clone() + ref_proj_new[:, :3, :4] = torch.matmul(ref_proj[:, 1, :3, :3], + ref_proj[:, 0, :3, :4]) + warped_src = homo_warping(src_fea, src_proj_new, ref_proj_new, + depth_hypo) # B C D H W + + warped_src = warped_src.reshape(B, group_cor_dim, + C // group_cor_dim, D, H, W) + ref_volume = ref_volume.reshape(B, group_cor_dim, + C // group_cor_dim, D, H, W) + cor_feat = (warped_src * ref_volume).mean(2) # B G D H W + del warped_src, src_proj, src_fea + + cor_weight = torch.softmax(cor_feat.sum(1) / self.attn_temp, + 1) / math.sqrt(C) # B D H W + cor_weight_sum += cor_weight # B D H W + cor_feats += cor_weight.unsqueeze(1) * cor_feat # B C D H W + del cor_weight, cor_feat + + cost_volume = cor_feats / cor_weight_sum.unsqueeze(1) # B C D H W + del cor_weight_sum, src_features + + # @Note step3: cost regularization + if geo_reg_data == {}: + # basic + cost_reg = regnet(cost_volume) + else: + # probability volume geometry embedding + cost_reg = regnet(cost_volume, stage_idx, geo_reg_data) + del cost_volume + prob_volume = F.softmax(cost_reg, dim=1) # B D H W + + # @Note step4: depth regression + prob_max_indices = prob_volume.max(1, keepdim=True)[1] # B 1 H W + depth = torch.gather(depth_hypo, 1, + prob_max_indices).squeeze(1) # B H W + + with torch.no_grad(): + photometric_confidence = prob_volume.max(1)[0] # B H W + photometric_confidence = F.interpolate( + photometric_confidence.unsqueeze(1), + scale_factor=1, + mode='bilinear', + align_corners=True).squeeze(1) + + last_depth_itv = 1. / depth_hypo[:, 2, :, :] - 1. / depth_hypo[:, + 1, :, :] + inverse_min_depth = 1 / depth + depth_interal_ratio * last_depth_itv # B H W + inverse_max_depth = 1 / depth - depth_interal_ratio * last_depth_itv # B H W + + output_stage = { + 'depth': depth, + 'photometric_confidence': photometric_confidence, + 'depth_hypo': depth_hypo, + 'prob_volume': prob_volume, + 'inverse_min_depth': inverse_min_depth, + 'inverse_max_depth': inverse_max_depth, + } + return output_stage diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py new file mode 100644 index 000000000..f2c811fb4 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py @@ -0,0 +1,120 @@ +# @Description: Loss Functions (Sec 3.4 in the paper). 
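+# Overview: geomvsnet_loss accumulates, for every coarse-to-fine stage, a
+# pixel-wise cross-entropy term (pixel_wise_loss) and a depth-distribution
+# similarity term (depth_distribution_similarity_loss), combined per stage as
+# 0.8 * pw_loss + 0.2 * dds_loss and scaled by the per-stage weights stage_lw.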
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import torch + + +def geomvsnet_loss(inputs, depth_gt_ms, mask_ms, **kwargs): + + stage_lw = kwargs.get('stage_lw', [1, 1, 1, 1]) + depth_values = kwargs.get('depth_values') + depth_min, depth_max = depth_values[:, 0], depth_values[:, -1] + + total_loss = torch.tensor( + 0.0, + dtype=torch.float32, + device=mask_ms['stage1'].device, + requires_grad=False) + pw_loss_stages = [] + dds_loss_stages = [] + for stage_idx, (stage_inputs, stage_key) in enumerate([ + (inputs[k], k) for k in inputs.keys() if 'stage' in k + ]): + + depth = stage_inputs['depth_filtered'] + prob_volume = stage_inputs['prob_volume'] + depth_value = stage_inputs['depth_hypo'] + + depth_gt = depth_gt_ms[stage_key] + mask = mask_ms[stage_key] > 0.5 + + # pw loss + pw_loss = pixel_wise_loss(prob_volume, depth_gt, mask, depth_value) + pw_loss_stages.append(pw_loss) + + # dds loss + dds_loss = depth_distribution_similarity_loss(depth, depth_gt, mask, + depth_min, depth_max) + dds_loss_stages.append(dds_loss) + + # total loss + lam1, lam2 = 0.8, 0.2 + total_loss = total_loss + stage_lw[stage_idx] * ( + lam1 * pw_loss + lam2 * dds_loss) + + depth_pred = stage_inputs['depth'] + depth_gt = depth_gt_ms[stage_key] + epe = cal_metrics(depth_pred, depth_gt, mask, depth_min, depth_max) + + return total_loss, epe, pw_loss_stages, dds_loss_stages + + +def pixel_wise_loss(prob_volume, depth_gt, mask, depth_value): + mask_true = mask + valid_pixel_num = torch.sum(mask_true, dim=[1, 2]) + 1e-12 + + shape = depth_gt.shape + + depth_num = depth_value.shape[1] + depth_value_mat = depth_value + + gt_index_image = torch.argmin( + torch.abs(depth_value_mat - depth_gt.unsqueeze(1)), dim=1) + + gt_index_image = torch.mul(mask_true, gt_index_image.type(torch.float)) + gt_index_image = torch.round(gt_index_image).type(torch.long).unsqueeze(1) + + gt_index_volume = torch.zeros(shape[0], depth_num, shape[1], + shape[2]).type(mask_true.type()).scatter_( + 1, gt_index_image, 1) + cross_entropy_image = -torch.sum( + gt_index_volume * torch.log(prob_volume + 1e-12), dim=1).squeeze(1) + masked_cross_entropy_image = torch.mul(mask_true, cross_entropy_image) + masked_cross_entropy = torch.sum(masked_cross_entropy_image, dim=[1, 2]) + + masked_cross_entropy = torch.mean(masked_cross_entropy / valid_pixel_num) + + pw_loss = masked_cross_entropy + return pw_loss + + +def depth_distribution_similarity_loss(depth, depth_gt, mask, depth_min, + depth_max): + depth_norm = depth * 128 / (depth_max - depth_min)[:, None, None] + depth_gt_norm = depth_gt * 128 / (depth_max - depth_min)[:, None, None] + + M_bins = 48 + kl_min = torch.min(torch.min(depth_gt), depth.mean() - 3. * depth.std()) + kl_max = torch.max(torch.max(depth_gt), depth.mean() + 3. 
* depth.std()) + bins = torch.linspace(kl_min, kl_max, steps=M_bins) + + kl_divs = [] + for i in range(len(bins) - 1): + bin_mask = (depth_gt >= bins[i]) & (depth_gt < bins[i + 1]) + merged_mask = mask & bin_mask + + if merged_mask.sum() > 0: + p = depth_norm[merged_mask] + q = depth_gt_norm[merged_mask] + kl_div = torch.nn.functional.kl_div( + torch.log(p) - torch.log(q), p, reduction='batchmean') + kl_div = torch.log(kl_div) + kl_divs.append(kl_div) + + dds_loss = sum(kl_divs) + return dds_loss + + +def cal_metrics(depth_pred, depth_gt, mask, depth_min, depth_max): + depth_pred_norm = depth_pred * 128 / (depth_max - depth_min)[:, None, None] + depth_gt_norm = depth_gt * 128 / (depth_max - depth_min)[:, None, None] + + abs_err = torch.abs(depth_pred_norm[mask] - depth_gt_norm[mask]) + epe = abs_err.mean() + # err1 = (abs_err <= 1).float().mean() * 100 + # err3 = (abs_err <= 3).float().mean() * 100 + + return epe # err1, err3 diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py new file mode 100644 index 000000000..8910ae3b3 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py @@ -0,0 +1,379 @@ +# @Description: Some sub-modules for the network. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FPN(nn.Module): + """FPN aligncorners downsample 4x""" + + def __init__(self, base_channels, gn=False): + super(FPN, self).__init__() + self.base_channels = base_channels + + self.conv0 = nn.Sequential( + Conv2d(3, base_channels, 3, 1, padding=1, gn=gn), + Conv2d(base_channels, base_channels, 3, 1, padding=1, gn=gn), + ) + + self.conv1 = nn.Sequential( + Conv2d( + base_channels, + base_channels * 2, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 2, base_channels * 2, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 2, base_channels * 2, 3, 1, padding=1, gn=gn), + ) + + self.conv2 = nn.Sequential( + Conv2d( + base_channels * 2, + base_channels * 4, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 4, base_channels * 4, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 4, base_channels * 4, 3, 1, padding=1, gn=gn), + ) + + self.conv3 = nn.Sequential( + Conv2d( + base_channels * 4, + base_channels * 8, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 8, base_channels * 8, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 8, base_channels * 8, 3, 1, padding=1, gn=gn), + ) + + self.out_channels = [8 * base_channels] + final_chs = base_channels * 8 + + self.inner1 = nn.Conv2d(base_channels * 4, final_chs, 1, bias=True) + self.inner2 = nn.Conv2d(base_channels * 2, final_chs, 1, bias=True) + self.inner3 = nn.Conv2d(base_channels * 1, final_chs, 1, bias=True) + + self.out1 = nn.Conv2d(final_chs, base_channels * 8, 1, bias=False) + self.out2 = nn.Conv2d( + final_chs, base_channels * 4, 3, padding=1, bias=False) + self.out3 = nn.Conv2d( + final_chs, base_channels * 2, 3, padding=1, bias=False) + self.out4 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + + self.out_channels.append(base_channels * 4) + self.out_channels.append(base_channels * 2) + self.out_channels.append(base_channels) + + def forward(self, x): + conv0 = self.conv0(x) + conv1 = self.conv1(conv0) + conv2 = 
self.conv2(conv1) + conv3 = self.conv3(conv2) + + intra_feat = conv3 + outputs = {} + out1 = self.out1(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner1(conv2) + out2 = self.out2(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner2(conv1) + out3 = self.out3(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner3(conv0) + out4 = self.out4(intra_feat) + + outputs['stage1'] = out1 + outputs['stage2'] = out2 + outputs['stage3'] = out3 + outputs['stage4'] = out4 + + return outputs + + +class Reg2d(nn.Module): + + def __init__(self, input_channel=128, base_channel=32): + super(Reg2d, self).__init__() + + self.conv0 = ConvBnReLU3D( + input_channel, base_channel, kernel_size=(1, 3, 3), pad=(0, 1, 1)) + self.conv1 = ConvBnReLU3D( + base_channel, + base_channel * 2, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv2 = ConvBnReLU3D(base_channel * 2, base_channel * 2) + + self.conv3 = ConvBnReLU3D( + base_channel * 2, + base_channel * 4, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv4 = ConvBnReLU3D(base_channel * 4, base_channel * 4) + + self.conv5 = ConvBnReLU3D( + base_channel * 4, + base_channel * 8, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv6 = ConvBnReLU3D(base_channel * 8, base_channel * 8) + + self.conv7 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 8, + base_channel * 4, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel * 4), + nn.ReLU(inplace=True)) + + self.conv9 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 4, + base_channel * 2, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel * 2), + nn.ReLU(inplace=True)) + + self.conv11 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 2, + base_channel, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel), + nn.ReLU(inplace=True)) + + self.prob = nn.Conv3d(8, 1, 1, stride=1, padding=0) + + def forward(self, x): + conv0 = self.conv0(x) + conv2 = self.conv2(self.conv1(conv0)) + conv4 = self.conv4(self.conv3(conv2)) + x = self.conv6(self.conv5(conv4)) + x = conv4 + self.conv7(x) + x = conv2 + self.conv9(x) + x = conv0 + self.conv11(x) + x = self.prob(x) + + return x.squeeze(1) + + +def homo_warping(src_fea, src_proj, ref_proj, depth_values): + # src_fea: [B, C, H, W] + # src_proj: [B, 4, 4] + # ref_proj: [B, 4, 4] + # depth_values: [B, Ndepth] o [B, Ndepth, H, W] + # out: [B, C, Ndepth, H, W] + C = src_fea.shape[1] + Hs, Ws = src_fea.shape[-2:] + B, num_depth, Hr, Wr = depth_values.shape + + with torch.no_grad(): + proj = torch.matmul(src_proj, torch.inverse(ref_proj)) + rot = proj[:, :3, :3] # [B,3,3] + trans = proj[:, :3, 3:4] # [B,3,1] + + y, x = torch.meshgrid([ + torch.arange(0, Hr, dtype=torch.float32, device=src_fea.device), + torch.arange(0, Wr, dtype=torch.float32, device=src_fea.device) + ]) + y = y.reshape(Hr * Wr) + x = x.reshape(Hr * Wr) + xyz = torch.stack((x, y, torch.ones_like(x))) # [3, H*W] + xyz = torch.unsqueeze(xyz, 0).repeat(B, 1, 1) # [B, 3, H*W] + rot_xyz = torch.matmul(rot, xyz) # [B, 3, H*W] + rot_depth_xyz = rot_xyz.unsqueeze(2).repeat( + 1, 1, 
num_depth, 1) * depth_values.reshape( + B, 1, num_depth, -1) # [B, 3, Ndepth, H*W] + proj_xyz = rot_depth_xyz + trans.reshape(B, 3, 1, + 1) # [B, 3, Ndepth, H*W] + # FIXME divide 0 + temp = proj_xyz[:, 2:3, :, :] + temp[temp == 0] = 1e-9 + proj_xy = proj_xyz[:, :2, :, :] / temp # [B, 2, Ndepth, H*W] + # proj_xy = proj_xyz[:, :2, :, :] / proj_xyz[:, 2:3, :, :] # [B, 2, Ndepth, H*W] + + proj_x_normalized = proj_xy[:, 0, :, :] / ((Ws - 1) / 2) - 1 + proj_y_normalized = proj_xy[:, 1, :, :] / ((Hs - 1) / 2) - 1 + proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), + dim=3) # [B, Ndepth, H*W, 2] + grid = proj_xy + if len(src_fea.shape) == 4: + warped_src_fea = F.grid_sample( + src_fea, + grid.reshape(B, num_depth * Hr, Wr, 2), + mode='bilinear', + padding_mode='zeros', + align_corners=True) + warped_src_fea = warped_src_fea.reshape(B, C, num_depth, Hr, Wr) + elif len(src_fea.shape) == 5: + warped_src_fea = [] + for d in range(src_fea.shape[2]): + warped_src_fea.append( + F.grid_sample( + src_fea[:, :, d], + grid.reshape(B, num_depth, Hr, Wr, 2)[:, d], + mode='bilinear', + padding_mode='zeros', + align_corners=True)) + warped_src_fea = torch.stack(warped_src_fea, dim=2) + + return warped_src_fea + + +def init_inverse_range(cur_depth, ndepths, device, dtype, H, W): + inverse_depth_min = 1. / cur_depth[:, 0] # (B,) + inverse_depth_max = 1. / cur_depth[:, -1] + itv = torch.arange( + 0, ndepths, device=device, dtype=dtype, requires_grad=False).reshape( + 1, -1, 1, 1).repeat(1, 1, H, W) / (ndepths - 1) # 1 D H W + inverse_depth_hypo = inverse_depth_max[:, None, None, None] + ( + inverse_depth_min - inverse_depth_max)[:, None, None, None] * itv + + return 1. / inverse_depth_hypo + + +def schedule_inverse_range(inverse_min_depth, inverse_max_depth, ndepths, H, + W): + # cur_depth_min, (B, H, W) + # cur_depth_max: (B, H, W) + itv = torch.arange( + 0, + ndepths, + device=inverse_min_depth.device, + dtype=inverse_min_depth.dtype, + requires_grad=False).reshape(1, -1, 1, 1).repeat( + 1, 1, H // 2, W // 2) / (ndepths - 1) # 1 D H W + + inverse_depth_hypo = inverse_max_depth[:, None, :, :] + ( + inverse_min_depth - inverse_max_depth)[:, None, :, :] * itv # B D H W + inverse_depth_hypo = F.interpolate( + inverse_depth_hypo.unsqueeze(1), [ndepths, H, W], + mode='trilinear', + align_corners=True).squeeze(1) + return 1. 
/ inverse_depth_hypo + + +# -------------------------------------------------------------- + + +def init_bn(module): + if module.weight is not None: + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + return + + +def init_uniform(module, init_method): + if module.weight is not None: + if init_method == 'kaiming': + nn.init.kaiming_uniform_(module.weight) + elif init_method == 'xavier': + nn.init.xavier_uniform_(module.weight) + return + + +class ConvBnReLU3D(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBnReLU3D, self).__init__() + self.conv = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm3d(out_channels) + + def forward(self, x): + return F.relu(self.bn(self.conv(x)), inplace=True) + + +class Conv2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn_momentum=0.1, + init_method='xavier', + gn=False, + group_channel=8, + **kwargs): + super(Conv2d, self).__init__() + bn = not gn + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.kernel_size = kernel_size + self.stride = stride + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.gn = nn.GroupNorm( + int(max(1, out_channels + / group_channel)), out_channels) if gn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + else: + x = self.gn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py new file mode 100644 index 000000000..16281fe0b --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py new file mode 100644 index 000000000..e6921f55f --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py @@ -0,0 +1,148 @@ +# @Description: Options settings & configurations for GeoMVSNet. 
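+# get_opts() builds the full command-line interface: network shape
+# (--levels, --hypo_plane_num_stages, --group_cor_dim_stages, ...),
+# training hyper-parameters (--lr, --lrepochs, --epochs, ...) and
+# logging/checkpoint options. Hypothetical invocation for illustration only
+# (the entry-script name is an assumption, not part of this diff):
+#   python train.py --which_dataset dtu --n_views 5 --levels 4 \
+#     --hypo_plane_num_stages 8,8,4,4 --depth_interal_ratio_stages 0.5,0.5,0.5,1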
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import argparse + + +def get_opts(): + parser = argparse.ArgumentParser(description='args') + + # global settings + parser.add_argument( + '--mode', + default='train', + help='train or test', + choices=['train', 'test', 'val']) + parser.add_argument( + '--which_dataset', + default='dtu', + choices=['dtu', 'tnt', 'blendedmvs', 'general'], + help='which dataset for using') + + parser.add_argument('--n_views', type=int, default=5, help='num of view') + parser.add_argument('--levels', type=int, default=4, help='num of stages') + parser.add_argument( + '--hypo_plane_num_stages', + type=str, + default='8,8,4,4', + help='num of hypothesis planes for each stage') + parser.add_argument( + '--depth_interal_ratio_stages', + type=str, + default='0.5,0.5,0.5,1', + help='depth interals for each stage') + parser.add_argument( + '--feat_base_channel', + type=int, + default=8, + help='channel num for base feature') + parser.add_argument( + '--reg_base_channel', + type=int, + default=8, + help='channel num for regularization') + parser.add_argument( + '--group_cor_dim_stages', + type=str, + default='8,8,4,4', + help='group correlation dim') + + parser.add_argument( + '--batch_size', type=int, default=1, help='batch size for training') + parser.add_argument( + '--data_scale', + type=str, + choices=['mid', 'raw'], + help='use mid or raw resolution') + parser.add_argument('--trainpath', help='data path for training') + parser.add_argument('--testpath', help='data path for testing') + parser.add_argument('--trainlist', help='data list for training') + parser.add_argument('--testlist', nargs='+', help='data list for testing') + + # training config + parser.add_argument( + '--stage_lw', + type=str, + default='1,1,1,1', + help='loss weight for different stages') + + parser.add_argument( + '--epochs', type=int, default=10, help='number of epochs to train') + parser.add_argument( + '--lr_scheduler', + type=str, + default='MS', + help='scheduler for learning rate') + parser.add_argument( + '--lr', type=float, default=0.001, help='learning rate') + parser.add_argument( + '--lrepochs', + type=str, + default='1,3,5,7,9,11,13,15:1.5', + help='epoch ids to downscale lr and the downscale rate') + parser.add_argument('--wd', type=float, default=0.0, help='weight decay') + + parser.add_argument( + '--summary_freq', + type=int, + default=100, + help='print and summary frequency') + parser.add_argument( + '--save_freq', type=int, default=1, help='save checkpoint frequency') + parser.add_argument( + '--eval_freq', type=int, default=1, help='eval frequency') + + parser.add_argument( + '--robust_train', action='store_true', help='robust training') + + # testing config + parser.add_argument( + '--split', + type=str, + choices=['intermediate', 'advanced'], + help='intermediate|advanced for tanksandtemples') + parser.add_argument( + '--img_mode', + type=str, + default='resize', + choices=['resize', 'crop'], + help='image resolution matching strategy for TNT dataset') + parser.add_argument( + '--cam_mode', + type=str, + default='origin', + choices=['origin', 'short_range'], + help='camera parameter strategy for TNT dataset') + + parser.add_argument( + '--loadckpt', default=None, help='load a specific checkpoint') + parser.add_argument( + '--logdir', + default='./checkpoints/debug', + help='the directory to save checkpoints/logs') + parser.add_argument( + '--nolog', 
action='store_true', help='do not log into .log file') + parser.add_argument( + '--notensorboard', + action='store_true', + help='do not log into tensorboard') + parser.add_argument( + '--save_conf_all_stages', + action='store_true', + help='save confidence maps for all stages') + parser.add_argument('--outdir', default='./outputs', help='output dir') + parser.add_argument( + '--resume', action='store_true', help='continue to train the model') + + # pytorch config + parser.add_argument('--device', default='cuda', help='device to use') + parser.add_argument( + '--seed', type=int, default=1, metavar='S', help='random seed') + parser.add_argument( + '--pin_m', action='store_true', help='data loader pin memory') + parser.add_argument('--local_rank', type=int, default=0) + + return parser.parse_args() diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py new file mode 100644 index 000000000..fe44862c5 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py @@ -0,0 +1,269 @@ +# @Description: Some useful utils. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import random +from bisect import bisect_right + +import numpy as np +import torch +import torch.distributed as dist +import torchvision.utils as vutils + + +# torch.no_grad warpper for functions +def make_nograd_func(func): + + def wrapper(*f_args, **f_kwargs): + with torch.no_grad(): + ret = func(*f_args, **f_kwargs) + return ret + + return wrapper + + +# convert a function into recursive style to handle nested dict/list/tuple variables +def make_recursive_func(func): + + def wrapper(vars): + if isinstance(vars, list): + return [wrapper(x) for x in vars] + elif isinstance(vars, tuple): + return tuple([wrapper(x) for x in vars]) + elif isinstance(vars, dict): + return {k: wrapper(v) for k, v in vars.items()} + else: + return func(vars) + + return wrapper + + +@make_recursive_func +def tensor2float(vars): + if isinstance(vars, float): + return vars + elif isinstance(vars, torch.Tensor): + return vars.data.item() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2float'.format(type(vars))) + + +@make_recursive_func +def tensor2numpy(vars): + if isinstance(vars, np.ndarray): + return vars + elif isinstance(vars, torch.Tensor): + return vars.detach().cpu().numpy().copy() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +@make_recursive_func +def tocuda(vars): + if isinstance(vars, torch.Tensor): + return vars.to(torch.device('cuda')) + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +def tb_save_scalars(logger, mode, scalar_dict, global_step): + scalar_dict = tensor2float(scalar_dict) + for key, value in scalar_dict.items(): + if not isinstance(value, (list, tuple)): + name = '{}/{}'.format(mode, key) + logger.add_scalar(name, value, global_step) + else: + for idx in range(len(value)): + name = '{}/{}_{}'.format(mode, key, idx) + logger.add_scalar(name, value[idx], global_step) + + +def tb_save_images(logger, mode, images_dict, global_step): + images_dict = tensor2numpy(images_dict) + + def preprocess(name, img): + if not (len(img.shape) == 3 or len(img.shape) == 4): + raise NotImplementedError( + 
'invalid img shape {}:{} in save_images'.format( + name, img.shape)) + if len(img.shape) == 3: + img = img[:, np.newaxis, :, :] + img = torch.from_numpy(img[:1]) + return vutils.make_grid( + img, padding=0, nrow=1, normalize=True, scale_each=True) + + for key, value in images_dict.items(): + if not isinstance(value, (list, tuple)): + name = '{}/{}'.format(mode, key) + logger.add_image(name, preprocess(name, value), global_step) + else: + for idx in range(len(value)): + name = '{}/{}_{}'.format(mode, key, idx) + logger.add_image(name, preprocess(name, value[idx]), + global_step) + + +class DictAverageMeter(object): + + def __init__(self): + self.data = {} + self.count = 0 + + def update(self, new_input): + self.count += 1 + if len(self.data) == 0: + for k, v in new_input.items(): + if not isinstance(v, float): + raise NotImplementedError('invalid data {}: {}'.format( + k, type(v))) + self.data[k] = v + else: + for k, v in new_input.items(): + if not isinstance(v, float): + raise NotImplementedError('invalid data {}: {}'.format( + k, type(v))) + self.data[k] += v + + def mean(self): + return {k: v / self.count for k, v in self.data.items()} + + +# a wrapper to compute metrics for each image individually +def compute_metrics_for_each_image(metric_func): + + def wrapper(depth_est, depth_gt, mask, *args): + batch_size = depth_gt.shape[0] + results = [] + # compute result one by one + for idx in range(batch_size): + ret = metric_func(depth_est[idx], depth_gt[idx], mask[idx], *args) + results.append(ret) + return torch.stack(results).mean() + + return wrapper + + +@make_nograd_func +@compute_metrics_for_each_image +def Thres_metrics(depth_est, depth_gt, mask, thres): + assert isinstance(thres, (int, float)) + depth_est, depth_gt = depth_est[mask], depth_gt[mask] + errors = torch.abs(depth_est - depth_gt) + err_mask = errors > thres + return torch.mean(err_mask.float()) + + +# NOTE: please do not use this to build up training loss +@make_nograd_func +@compute_metrics_for_each_image +def AbsDepthError_metrics(depth_est, depth_gt, mask, thres=None): + depth_est, depth_gt = depth_est[mask], depth_gt[mask] + error = (depth_est - depth_gt).abs() + if thres is not None: + error = error[(error >= float(thres[0])) & (error <= float(thres[1]))] + if error.shape[0] == 0: + return torch.tensor(0, device=error.device, dtype=error.dtype) + return torch.mean(error) + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def reduce_scalar_outputs(scalar_outputs): + world_size = get_world_size() + if world_size < 2: + return scalar_outputs + with torch.no_grad(): + names = [] + scalars = [] + for k in sorted(scalar_outputs.keys()): + names.append(k) + scalars.append(scalar_outputs[k]) + scalars = torch.stack(scalars, dim=0) + dist.reduce(scalars, dst=0) + if dist.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + scalars /= world_size + reduced_scalars = {k: v for k, v in zip(names, scalars)} + + return reduced_scalars + + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + 
warmup_iters=500, + warmup_method='linear', + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + 'Milestones should be a list of' + ' increasing integers. Got {}', + milestones, + ) + + if warmup_method not in ('constant', 'linear'): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + 'got {}'.format(warmup_method)) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == 'constant': + warmup_factor = self.warmup_factor + elif self.warmup_method == 'linear': + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr * warmup_factor + * self.gamma**bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] + + +def set_random_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py new file mode 100644 index 000000000..2ffda232c --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py @@ -0,0 +1,678 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def init_bn(module): + if module.weight is not None: + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + return + + +def init_uniform(module, init_method): + if module.weight is not None: + if init_method == 'kaiming': + nn.init.kaiming_uniform_(module.weight) + elif init_method == 'xavier': + nn.init.xavier_uniform_(module.weight) + return + + +class Conv2d(nn.Module): + """Applies a 2D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv2d, self).__init__() + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.kernel_size = kernel_size + self.stride = stride + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv2d(nn.Module): + """Applies a 2D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
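+ Note: when stride is 2, the transposed-convolution output is cropped to
+ exactly (2*H, 2*W) of the input's spatial size before batch normalization
+ is applied (see forward below).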
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv2d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.stride == 2: + h, w = list(x.size())[2:] + y = y[:, :, :2 * h, :2 * w].contiguous() + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Conv3d(nn.Module): + """Applies a 3D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv3d, self).__init__() + self.out_channels = out_channels + self.kernel_size = kernel_size + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv3d(nn.Module): + """Applies a 3D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv3d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class ConvBnReLU(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBnReLU, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return F.relu(self.bn(self.conv(x)), inplace=True) + + +class ConvBn(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBn, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return self.bn(self.conv(x)) + + +def homo_warping(src_fea, src_proj, ref_proj, depth_values): + """ + src_fea: [B, C, H, W] + src_proj: [B, 4, 4] + ref_proj: [B, 4, 4] + depth_values: [B, Ndepth] o [B, Ndepth, H, W] + out: [B, C, Ndepth, H, W] + """ + batch, channels = src_fea.shape[0], src_fea.shape[1] + num_depth = depth_values.shape[1] + height, width = src_fea.shape[2], src_fea.shape[3] + + with torch.no_grad(): + proj = torch.matmul(src_proj, torch.inverse(ref_proj)) + rot = proj[:, :3, :3] # [B,3,3] + trans = proj[:, :3, 3:4] # [B,3,1] + + y, x = torch.meshgrid([ + torch.arange( + 0, height, dtype=torch.float32, device=src_fea.device), + torch.arange(0, width, dtype=torch.float32, device=src_fea.device) + ]) + y, x = y.contiguous(), x.contiguous() + y, x = y.view(height * width), x.view(height * width) + xyz = torch.stack((x, y, torch.ones_like(x))) # [3, H*W] + xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1) # [B, 3, H*W] + rot_xyz = torch.matmul(rot, xyz) # [B, 3, H*W] + rot_depth_xyz = rot_xyz.unsqueeze(2).repeat( + 1, 1, num_depth, 1) * depth_values.view(batch, 1, num_depth, + -1) # [B, 3, Ndepth, H*W] + proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1, + 1) # [B, 3, Ndepth, H*W] + proj_xy = proj_xyz[:, : + 2, :, :] / proj_xyz[:, 2: + 3, :, :] # [B, 2, Ndepth, H*W] + proj_x_normalized = proj_xy[:, 0, :, :] / ((width - 1) / 2) - 1 + proj_y_normalized = proj_xy[:, 1, :, :] / ((height - 1) / 2) - 1 + proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), + dim=3) # [B, Ndepth, H*W, 2] + grid = proj_xy + + warped_src_fea = F.grid_sample( + src_fea, + grid.view(batch, num_depth * height, width, 2), + mode='bilinear', + padding_mode='zeros', + align_corners=True) + warped_src_fea = warped_src_fea.view(batch, channels, num_depth, 
height, + width) + + return warped_src_fea + + +class DeConv2dFuse(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + relu=True, + bn=True, + bn_momentum=0.1): + super(DeConv2dFuse, self).__init__() + + self.deconv = Deconv2d( + in_channels, + out_channels, + kernel_size, + stride=2, + padding=1, + output_padding=1, + bn=True, + relu=relu, + bn_momentum=bn_momentum) + + self.conv = Conv2d( + 2 * out_channels, + out_channels, + kernel_size, + stride=1, + padding=1, + bn=bn, + relu=relu, + bn_momentum=bn_momentum) + + def forward(self, x_pre, x): + x = self.deconv(x) + x = torch.cat((x, x_pre), dim=1) + x = self.conv(x) + return x + + +class FeatureNet(nn.Module): + + def __init__(self, base_channels, num_stage=3, stride=4, arch_mode='unet'): + super(FeatureNet, self).__init__() + assert arch_mode in [ + 'unet', 'fpn' + ], f"mode must be in 'unet' or 'fpn', but get:{arch_mode}" + self.arch_mode = arch_mode + self.stride = stride + self.base_channels = base_channels + self.num_stage = num_stage + + self.conv0 = nn.Sequential( + Conv2d(3, base_channels, 3, 1, padding=1), + Conv2d(base_channels, base_channels, 3, 1, padding=1), + ) + + self.conv1 = nn.Sequential( + Conv2d(base_channels, base_channels * 2, 5, stride=2, padding=2), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + ) + + self.conv2 = nn.Sequential( + Conv2d( + base_channels * 2, base_channels * 4, 5, stride=2, padding=2), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + ) + + self.out1 = nn.Conv2d( + base_channels * 4, base_channels * 4, 1, bias=False) + self.out_channels = [4 * base_channels] + + if self.arch_mode == 'unet': + if num_stage == 3: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + self.deconv2 = DeConv2dFuse(base_channels * 2, base_channels, + 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out3 = nn.Conv2d( + base_channels, base_channels, 1, bias=False) + self.out_channels.append(2 * base_channels) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out_channels.append(2 * base_channels) + elif self.arch_mode == 'fpn': + final_chs = base_channels * 4 + if num_stage == 3: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + self.inner2 = nn.Conv2d( + base_channels * 1, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels * 2, 3, padding=1, bias=False) + self.out3 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels * 2) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels) + + def forward(self, x): + conv0 = self.conv0(x) + conv1 = self.conv1(conv0) + conv2 = self.conv2(conv1) + + intra_feat = conv2 + outputs = {} + out = self.out1(intra_feat) + outputs['stage1'] = out + if self.arch_mode == 'unet': + if self.num_stage == 3: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = self.deconv2(conv0, intra_feat) + 
out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + elif self.arch_mode == 'fpn': + if self.num_stage == 3: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner2(conv0) + out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + return outputs + + +class CostRegNet(nn.Module): + + def __init__(self, in_channels, base_channels): + super(CostRegNet, self).__init__() + self.conv0 = Conv3d(in_channels, base_channels, padding=1) + + self.conv1 = Conv3d( + base_channels, base_channels * 2, stride=2, padding=1) + self.conv2 = Conv3d(base_channels * 2, base_channels * 2, padding=1) + + self.conv3 = Conv3d( + base_channels * 2, base_channels * 4, stride=2, padding=1) + self.conv4 = Conv3d(base_channels * 4, base_channels * 4, padding=1) + + self.conv5 = Conv3d( + base_channels * 4, base_channels * 8, stride=2, padding=1) + self.conv6 = Conv3d(base_channels * 8, base_channels * 8, padding=1) + + self.conv7 = Deconv3d( + base_channels * 8, + base_channels * 4, + stride=2, + padding=1, + output_padding=1) + + self.conv9 = Deconv3d( + base_channels * 4, + base_channels * 2, + stride=2, + padding=1, + output_padding=1) + + self.conv11 = Deconv3d( + base_channels * 2, + base_channels * 1, + stride=2, + padding=1, + output_padding=1) + + self.prob = nn.Conv3d( + base_channels, 1, 3, stride=1, padding=1, bias=False) + + def forward(self, x): + conv0 = self.conv0(x) + conv2 = self.conv2(self.conv1(conv0)) + conv4 = self.conv4(self.conv3(conv2)) + x = self.conv6(self.conv5(conv4)) + x = conv4 + self.conv7(x) + x = conv2 + self.conv9(x) + x = conv0 + self.conv11(x) + x = self.prob(x) + return x + + +class RefineNet(nn.Module): + + def __init__(self): + super(RefineNet, self).__init__() + self.conv1 = ConvBnReLU(4, 32) + self.conv2 = ConvBnReLU(32, 32) + self.conv3 = ConvBnReLU(32, 32) + self.res = ConvBnReLU(32, 1) + + def forward(self, img, depth_init): + concat = F.cat((img, depth_init), dim=1) + depth_residual = self.res(self.conv3(self.conv2(self.conv1(concat)))) + depth_refined = depth_init + depth_residual + return depth_refined + + +def depth_regression(p, depth_values): + if depth_values.dim() <= 2: + depth_values = depth_values.view(*depth_values.shape, 1, 1) + depth = torch.sum(p * depth_values, 1) + + return depth + + +def cas_mvsnet_loss(inputs, depth_gt_ms, mask_ms, **kwargs): + depth_loss_weights = kwargs.get('dlossw', None) + + total_loss = torch.tensor( + 0.0, + dtype=torch.float32, + device=mask_ms['stage1'].device, + requires_grad=False) + + for (stage_inputs, stage_key) in [(inputs[k], k) for k in inputs.keys() + if 'stage' in k]: + depth_est = stage_inputs['depth'] + depth_gt = depth_gt_ms[stage_key] + mask = mask_ms[stage_key] + mask = mask > 0.5 + + depth_loss = F.smooth_l1_loss( + depth_est[mask], depth_gt[mask], reduction='mean') + + if depth_loss_weights is not None: + stage_idx = int(stage_key.replace('stage', '')) - 1 + total_loss += depth_loss_weights[stage_idx] * depth_loss + else: + total_loss += 1.0 * depth_loss + + return total_loss, depth_loss + + +def 
get_cur_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape, (B, H, W) + cur_depth: (B, H, W) + return depth_range_values: (B, D, H, W) + """ + cur_depth_min = (cur_depth - ndepth / 2 * depth_inteval_pixel) # (B, H, W) + cur_depth_max = (cur_depth + ndepth / 2 * depth_inteval_pixel) + + assert cur_depth.shape == torch.Size( + shape), 'cur_depth:{}, input shape:{}'.format(cur_depth.shape, shape) + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, H, W) + + depth_range_samples = cur_depth_min.unsqueeze(1) + ( + torch.arange( + 0, + ndepth, + device=cur_depth.device, + dtype=cur_depth.dtype, + requires_grad=False).reshape(1, -1, 1, 1) + * new_interval.unsqueeze(1)) + + return depth_range_samples + + +def get_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + device, + dtype, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape: (B, H, W) + cur_depth: (B, H, W) or (B, D) + return depth_range_samples: (B, D, H, W) + """ + if cur_depth.dim() == 2: + cur_depth_min = cur_depth[:, 0] # (B,) + cur_depth_max = cur_depth[:, -1] + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, ) + + depth_range_samples = cur_depth_min.unsqueeze(1) + (torch.arange( + 0, ndepth, device=device, dtype=dtype, + requires_grad=False).reshape(1, -1) * new_interval.unsqueeze(1) + ) # noqa # (B, D) + + depth_range_samples = depth_range_samples.unsqueeze(-1).unsqueeze( + -1).repeat(1, 1, shape[1], shape[2]) # (B, D, H, W) + + else: + + depth_range_samples = get_cur_depth_range_samples( + cur_depth, ndepth, depth_inteval_pixel, shape, max_depth, + min_depth) + + return depth_range_samples diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py new file mode 100644 index 000000000..aeab02b36 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py @@ -0,0 +1,118 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import random + +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.utils as vutils + + +# convert a function into recursive style to handle nested dict/list/tuple variables +def make_recursive_func(func): + + def wrapper(vars): + if isinstance(vars, list): + return [wrapper(x) for x in vars] + elif isinstance(vars, tuple): + return tuple([wrapper(x) for x in vars]) + elif isinstance(vars, dict): + return {k: wrapper(v) for k, v in vars.items()} + else: + return func(vars) + + return wrapper + + +@make_recursive_func +def tensor2numpy(vars): + if isinstance(vars, np.ndarray): + return vars + elif isinstance(vars, torch.Tensor): + return vars.detach().cpu().numpy().copy() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +@make_recursive_func +def numpy2torch(vars): + if isinstance(vars, np.ndarray): + return torch.from_numpy(vars) + elif isinstance(vars, torch.Tensor): + return vars + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for numpy2torch'.format(type(vars))) + + +@make_recursive_func +def tocuda(vars): + if isinstance(vars, torch.Tensor): + return vars.to(torch.device('cuda')) + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +def generate_pointcloud(rgb, depth, ply_file, intr, 
scale=1.0): + """ + Generate a colored point cloud in PLY format from a color and a depth image. + + Input: + rgb_file -- filename of color image + depth_file -- filename of depth image + ply_file -- filename of ply file + + """ + fx, fy, cx, cy = intr[0, 0], intr[1, 1], intr[0, 2], intr[1, 2] + points = [] + for v in range(rgb.shape[0]): + for u in range(rgb.shape[1]): + color = rgb[v, u] # rgb.getpixel((u, v)) + Z = depth[v, u] / scale + if Z == 0: + continue + X = (u - cx) * Z / fx + Y = (v - cy) * Z / fy + points.append('%f %f %f %d %d %d 0\n' % + (X, Y, Z, color[0], color[1], color[2])) + file = open(ply_file, 'w') + file.write('''ply + format ascii 1.0 + element vertex %d + property float x + property float y + property float z + property uchar red + property uchar green + property uchar blue + property uchar alpha + end_header + %s + ''' % (len(points), ''.join(points))) + file.close() + + +def write_cam(file, cam): + f = open(file, 'w') + f.write('extrinsic\n') + for i in range(0, 4): + for j in range(0, 4): + f.write(str(cam[0][i][j]) + ' ') + f.write('\n') + f.write('\n') + + f.write('intrinsic\n') + for i in range(0, 3): + for j in range(0, 3): + f.write(str(cam[1][i][j]) + ' ') + f.write('\n') + + f.write('\n' + str(cam[1][3][0]) + ' ' + str(cam[1][3][1]) + ' ' + + str(cam[1][3][2]) + ' ' + str(cam[1][3][3]) + '\n') + + f.close() diff --git a/modelscope/models/cv/image_normal_estimation/__init__.py b/modelscope/models/cv/image_normal_estimation/__init__.py new file mode 100644 index 000000000..9551a3842 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .omnidata_model import OmnidataNormalEstimation + +else: + _import_structure = { + 'omnidata_model': ['OmnidataNormalEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_normal_estimation/modules/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py new file mode 100644 index 000000000..41564c78f --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py @@ -0,0 +1,20 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch + + +class BaseModel(torch.nn.Module): + + def load(self, path): + """Load model from file. 
+ + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if 'optimizer' in parameters: + parameters = parameters['model'] + + self.load_state_dict(parameters) diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py new file mode 100644 index 000000000..e0a30733a --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py @@ -0,0 +1,395 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn + +from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, forward_vit) + + +def _make_encoder( + backbone, + features, + use_pretrained, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout='ignore', +): + if backbone == 'vitl16_384': + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, + expand=expand) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb_rn50_384': + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, + expand=expand) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb16_384': + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, + expand=expand) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == 'resnext101_wsl': + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], + features, + groups=groups, + expand=expand) # efficientnet_lite3 + elif backbone == 'efficientnet_lite3': + pretrained = _make_pretrained_efficientnet_lite3( + use_pretrained, exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], + features, + groups=groups, + expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand is True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + 'rwightman/gen-efficientnet-pytorch', + 'tf_efficientnet_lite3', + pretrained=use_pretrained, + exportable=exportable) + return 
_make_efficientnet_backbone(efficientnet) + + +def _make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1, + effnet.act1, *effnet.blocks[0:2]) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, + resnet.maxpool, resnet.layer1) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + pretrained.layer4 = resnet.layer4 + + return pretrained + + +def _make_pretrained_resnext101_wsl(use_pretrained): + resnet = torch.hub.load('facebookresearch/WSL-Images', + 'resnext101_32x8d_wsl') + return _make_resnet_backbone(resnet) + + +class Interpolate(nn.Module): + """Interpolation module. + """ + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode='bilinear', align_corners=True) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + if self.bn is True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn is True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn is True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block. + """ + + def __init__(self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand is True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py new file mode 100644 index 000000000..af7993278 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py @@ -0,0 +1,108 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base_model import BaseModel +from .blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, + Interpolate, _make_encoder, forward_vit) + + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + + +class DPT(BaseModel): + + def __init__( + self, + head, + features=256, + backbone='vitb_rn50_384', + readout='project', + channels_last=False, + use_bn=False, + ): + + super(DPT, self).__init__() + + self.channels_last = channels_last + + hooks = { + 'vitb_rn50_384': [0, 1, 8, 11], + 'vitb16_384': [2, 5, 8, 11], + 'vitl16_384': [5, 11, 17, 23], + } + + # Instantiate backbone and reassemble blocks + self.pretrained, self.scratch = _make_encoder( + backbone, + features, + False, # Set to true of you want to train from scratch, uses ImageNet weights + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.scratch.output_conv = head + + def forward(self, x): + if self.channels_last is True: + x.contiguous(memory_format=torch.channels_last) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return out + + +class DPTDepthModel(DPT): + + def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs): + features = kwargs['features'] if 'features' in kwargs else 256 + + head = nn.Sequential( + nn.Conv2d( + features, features // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode='bilinear', align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + + def forward(self, x): + return super().forward(x).squeeze(dim=1) 
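# Illustrative sketch (not part of the patch): how the DPTDepthModel defined
# above can be instantiated for 3-channel surface-normal prediction, as done
# by OmnidataNormalEstimation later in this diff. It assumes a timm version
# that still provides the 'vit_base_resnet50_384' hybrid backbone; with no
# checkpoint loaded the weights are randomly initialised, so the example only
# checks shapes.
import torch

from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \
    DPTDepthModel

model = DPTDepthModel(backbone='vitb_rn50_384', num_channels=3).eval()

x = torch.randn(1, 3, 384, 384)  # one RGB image at the 384x384 resolution used by the ViT-hybrid backbone
with torch.no_grad():
    normals = model(x)  # shape (1, 3, 384, 384); non-negative because the head ends in a ReLU

# Real weights would be loaded beforehand via model.load('<checkpoint path>'),
# the helper defined in base_model.py above.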
diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py new file mode 100644 index 000000000..bb8ba9f31 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py @@ -0,0 +1,517 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import math +import types + +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Slice(nn.Module): + + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential( + nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + _ = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations['1'] + layer_2 = pretrained.activations['2'] + layer_3 = pretrained.activations['3'] + layer_4 = pretrained.activations['4'] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size([ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ]), + )) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)]( + layer_1) + layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)]( + layer_2) + layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)]( + layer_3) + layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)]( + layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, :self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=(gs_h, gs_w), mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed 
= self._resize_pos_embed(self.pos_embed, h // self.patch_size[1], + w // self.patch_size[0]) + + B = x.shape[0] + + if hasattr(self.patch_embed, 'backbone'): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[ + -1] # last feature if backbone outputs list/tuple of features + + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, 'dist_token', None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +activations = {} + + +def get_activation(name): + + def hook(model, input, output): + activations[name] = output + + return hook + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == 'ignore': + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == 'add': + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == 'project': + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + 
in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_large_patch16_384', pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model( + 'vit_deit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_distil_384(pretrained, + use_readout='ignore', + hooks=None): + model = timm.create_model( + 'vit_deit_base_distilled_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + start_index=2, + ) + + +def _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=[0, 1, 8, 11], + vit_features=768, + use_vit_only=False, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + + if use_vit_only: + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + else: + pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( + get_activation('1')) + pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( + get_activation('2')) + + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + if use_vit_only: + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + 
Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + else: + pretrained.act_postprocess1 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + pretrained.act_postprocess2 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitb_rn50_384(pretrained, + use_readout='ignore', + hooks=None, + use_vit_only=False): + model = timm.create_model('vit_base_resnet50_384', pretrained=pretrained) + + hooks = [0, 1, 8, 11] if hooks is None else hooks + return _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) diff --git a/modelscope/models/cv/image_normal_estimation/omnidata_model.py b/modelscope/models/cv/image_normal_estimation/omnidata_model.py new file mode 100644 index 000000000..35e89c1c8 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/omnidata_model.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+# Model: Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets from 3D Scans +# Paper link: https://arxiv.org/pdf/2110.04994.pdf +import os.path as osp + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \ + DPTDepthModel +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_normal_estimation, + module_name=Models.omnidata_normal_estimation) +class OmnidataNormalEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + self.model = DPTDepthModel( + backbone='vitb_rn50_384', num_channels=3) # DPT Hybrid + # checkpoint = torch.load(pretrained_weights_path, map_location=map_location) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoint = torch.load(model_path, map_location='cpu') + if 'state_dict' in checkpoint: + state_dict = {} + for k, v in checkpoint['state_dict'].items(): + state_dict[k[6:]] = v + else: + state_dict = checkpoint + self.model.load_state_dict(state_dict) + self.model.eval() + + def forward(self, inputs): + return self.model(inputs['imgs']).clamp(min=0, max=1) + + def postprocess(self, inputs): + normal_result = inputs.flip(1) + results = {OutputKeys.NORMALS: normal_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py index 24451e96c..8bbc80589 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py @@ -1,4 +1,4 @@ -# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py import time diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py index 64d959713..8f39db786 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py @@ -1,4 +1,4 @@ -# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py from collections import OrderedDict diff --git a/modelscope/models/cv/image_probing_model/backbone.py b/modelscope/models/cv/image_probing_model/backbone.py index 8f3ed5b6f..64fb37b3c 100644 --- a/modelscope/models/cv/image_probing_model/backbone.py +++ b/modelscope/models/cv/image_probing_model/backbone.py @@ -1,5 +1,5 @@ # The implementation is adopted from OpenAI-CLIP, -# made pubicly available under the MIT License at https://github.com/openai/CLIP +# made publicly available under the MIT License at 
https://github.com/openai/CLIP import math import sys diff --git a/modelscope/models/cv/image_quality_assessment_man/maniqa.py b/modelscope/models/cv/image_quality_assessment_man/maniqa.py index 8c9243096..eb037941b 100644 --- a/modelscope/models/cv/image_quality_assessment_man/maniqa.py +++ b/modelscope/models/cv/image_quality_assessment_man/maniqa.py @@ -1,4 +1,4 @@ -# This implementation is adopted from MANIQA, made pubicly available under the Apache License 2.0 at +# This implementation is adopted from MANIQA, made publicly available under the Apache License 2.0 at # https://github.com/IIGROUP/MANIQA/blob/master/models/maniqa.py import timm diff --git a/modelscope/models/cv/image_quality_assessment_man/swin.py b/modelscope/models/cv/image_quality_assessment_man/swin.py index df58277f2..e77488c04 100644 --- a/modelscope/models/cv/image_quality_assessment_man/swin.py +++ b/modelscope/models/cv/image_quality_assessment_man/swin.py @@ -1,4 +1,4 @@ -# This implementation is adopted form SwinTransformer, made pubicly available under the MIT License at +# This implementation is adopted form SwinTransformer, made publicly available under the MIT License at # https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py import collections.abc diff --git a/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py index e153e5f96..9282005ec 100644 --- a/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py +++ b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py @@ -1,4 +1,4 @@ -# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at +# The implementation is adopted from CenseoQoE, made publicly available under the MIT License at # https://github.com/Tencent/CenseoQoE import os diff --git a/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py index fbe40e6ae..f5710bc5a 100644 --- a/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py +++ b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at +# The implementation is adopted from CenseoQoE, made publicly available under the MIT License at # https://github.com/Tencent/CenseoQoE import torch diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py index 3b032949d..87c43340d 100644 --- a/modelscope/models/cv/image_reid_person/pass_model.py +++ b/modelscope/models/cv/image_reid_person/pass_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at +# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at # https://github.com/CASIA-IVA-Lab/PASS-reID import os diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py index 5bceb4685..924c58973 100644 --- a/modelscope/models/cv/image_reid_person/transreid_model.py +++ b/modelscope/models/cv/image_reid_person/transreid_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at +# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 
License at # https://github.com/CASIA-IVA-Lab/PASS-reID import collections.abc as container_abcs diff --git a/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py b/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py index 33de31e6f..414eae89f 100644 --- a/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py +++ b/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py @@ -17,11 +17,11 @@ import torch import torch.nn.functional as F from diffusers.models.activations import get_activation -from diffusers.models.attention import AdaGroupNorm from diffusers.models.attention_processor import (Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0) from diffusers.models.dual_transformer_2d import DualTransformer2DModel +from diffusers.models.normalization import AdaLayerNorm from diffusers.models.resnet import (Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D) diff --git a/modelscope/models/cv/image_to_3d/__init__.py b/modelscope/models/cv/image_to_3d/__init__.py new file mode 100644 index 000000000..44c424281 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from . import ldm diff --git a/modelscope/models/cv/image_to_3d/ldm/base_utils.py b/modelscope/models/cv/image_to_3d/ldm/base_utils.py new file mode 100644 index 000000000..3362fa18f --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/base_utils.py @@ -0,0 +1,211 @@ +import pickle + +import cv2 +import numpy as np +from skimage.io import imread + + +def save_pickle(data, pkl_path): + # os.system('mkdir -p {}'.format(os.path.dirname(pkl_path))) + with open(pkl_path, 'wb') as f: + pickle.dump(data, f) + + +def read_pickle(pkl_path): + with open(pkl_path, 'rb') as f: + return pickle.load(f) + + +def draw_epipolar_line(F, img0, img1, pt0, color): + h1, w1 = img1.shape[:2] + hpt = np.asarray([pt0[0], pt0[1], 1], dtype=np.float32)[:, None] + ln = F @ hpt + ln = ln[:, 0] + a, b, c = ln[0], ln[1], ln[2] + pt1 = np.asarray([0, -c / b]).astype(np.int32) + pt2 = np.asarray([w1, (-a * w1 - c) / b]).astype(np.int32) + + img0 = cv2.circle(img0, tuple(pt0.astype(np.int32)), 5, color, 2) + img1 = cv2.line(img1, tuple(pt1), tuple(pt2), color, 2) + return img0, img1 + + +def draw_epipolar_lines(F, img0, img1, num=20): + img0, img1 = img0.copy(), img1.copy() + h0, w0, _ = img0.shape + h1, w1, _ = img1.shape + + for k in range(num): + color = np.random.randint(0, 255, [3], dtype=np.int32) + color = [int(c) for c in color] + pt = np.random.uniform(0, 1, 2) + pt[0] *= w0 + pt[1] *= h0 + pt = pt.astype(np.int32) + img0, img1 = draw_epipolar_line(F, img0, img1, pt, color) + + return img0, img1 + + +def compute_F(K1, K2, Rt0, Rt1=None): + if Rt1 is None: + R, t = Rt0[:, :3], Rt0[:, 3:] + else: + Rt = compute_dR_dt(Rt0, Rt1) + R, t = Rt[:, :3], Rt[:, 3:] + A = K1 @ R.T @ t # [3,1] + C = np.asarray([[0, -A[2, 0], A[1, 0]], [A[2, 0], 0, -A[0, 0]], + [-A[1, 0], A[0, 0], 0]]) + F = (np.linalg.inv(K2)).T @ R @ K1.T @ C + return F + + +def compute_dR_dt(Rt0, Rt1): + R0, t0 = Rt0[:, :3], Rt0[:, 3:] + R1, t1 = Rt1[:, :3], Rt1[:, 3:] + dR = np.dot(R1, R0.T) + dt = t1 - np.dot(dR, t0) + return np.concatenate([dR, dt], -1) + + +def concat_images(img0, img1, vert=False): + if not vert: + h0, h1 = img0.shape[0], img1.shape[0], + if h0 < h1: + img0 = cv2.copyMakeBorder( + img0, + 0, + h1 - h0, + 0, + 0, + borderType=cv2.BORDER_CONSTANT, + 
value=0) + if h1 < h0: + img1 = cv2.copyMakeBorder( + img1, + 0, + h0 - h1, + 0, + 0, + borderType=cv2.BORDER_CONSTANT, + value=0) + img = np.concatenate([img0, img1], axis=1) + else: + w0, w1 = img0.shape[1], img1.shape[1] + if w0 < w1: + img0 = cv2.copyMakeBorder( + img0, + 0, + 0, + 0, + w1 - w0, + borderType=cv2.BORDER_CONSTANT, + value=0) + if w1 < w0: + img1 = cv2.copyMakeBorder( + img1, + 0, + 0, + 0, + w0 - w1, + borderType=cv2.BORDER_CONSTANT, + value=0) + img = np.concatenate([img0, img1], axis=0) + + return img + + +def concat_images_list(*args, vert=False): + if len(args) == 1: + return args[0] + img_out = args[0] + for img in args[1:]: + img_out = concat_images(img_out, img, vert) + return img_out + + +def pose_inverse(pose): + R = pose[:, :3].T + t = -R @ pose[:, 3:] + return np.concatenate([R, t], -1) + + +def project_points(pts, RT, K): + pts = np.matmul(pts, RT[:, :3].transpose()) + RT[:, 3:].transpose() + pts = np.matmul(pts, K.transpose()) + dpt = pts[:, 2] + mask0 = (np.abs(dpt) < 1e-4) & (np.abs(dpt) > 0) + if np.sum(mask0) > 0: + dpt[mask0] = 1e-4 + mask1 = (np.abs(dpt) > -1e-4) & (np.abs(dpt) < 0) + if np.sum(mask1) > 0: + dpt[mask1] = -1e-4 + pts2d = pts[:, :2] / dpt[:, None] + return pts2d, dpt + + +def draw_keypoints(img, kps, colors=None, radius=2): + out_img = img.copy() + for pi, pt in enumerate(kps): + pt = np.round(pt).astype(np.int32) + if colors is not None: + color = [int(c) for c in colors[pi]] + cv2.circle(out_img, tuple(pt), radius, color, -1) + else: + cv2.circle(out_img, tuple(pt), radius, (0, 255, 0), -1) + return out_img + + +def output_points(fn, pts, colors=None): + with open(fn, 'w') as f: + for pi, pt in enumerate(pts): + f.write(f'{pt[0]:.6f} {pt[1]:.6f} {pt[2]:.6f} ') + if colors is not None: + f.write( + f'{int(colors[pi,0])} {int(colors[pi,1])} {int(colors[pi,2])}' + ) + f.write('\n') + + +DEPTH_MAX, DEPTH_MIN = 2.4, 0.6 +DEPTH_VALID_MAX, DEPTH_VALID_MIN = 2.37, 0.63 + + +def read_depth_objaverse(depth_fn): + depth = imread(depth_fn) + depth = depth.astype( + np.float32) / 65535 * (DEPTH_MAX - DEPTH_MIN) + DEPTH_MIN + mask = (depth > DEPTH_VALID_MIN) & (depth < DEPTH_VALID_MAX) + return depth, mask + + +def mask_depth_to_pts(mask, depth, K, rgb=None): + hs, ws = np.nonzero(mask) + depth = depth[hs, ws] + pts = np.asarray([ws, hs, depth], np.float32).transpose() + pts[:, :2] *= pts[:, 2:] + if rgb is not None: + return np.dot(pts, np.linalg.inv(K).transpose()), rgb[hs, ws] + else: + return np.dot(pts, np.linalg.inv(K).transpose()) + + +def transform_points_pose(pts, pose): + R, t = pose[:, :3], pose[:, 3] + if len(pts.shape) == 1: + return (R @ pts[:, None] + t[:, None])[:, 0] + return pts @ R.T + t[None, :] + + +def pose_apply(pose, pts): + return transform_points_pose(pts, pose) + + +def downsample_gaussian_blur(img, ratio): + sigma = (1 / ratio) / 3 + # ksize=np.ceil(2*sigma) + ksize = int(np.ceil(((sigma - 0.8) / 0.3 + 1) * 2 + 1)) + ksize = ksize + 1 if ksize % 2 == 0 else ksize + img = cv2.GaussianBlur( + img, (ksize, ksize), sigma, borderType=cv2.BORDER_REFLECT101) + return img diff --git a/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py b/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py new file mode 100644 index 000000000..6d5a538e1 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py @@ -0,0 +1,558 @@ +from contextlib import contextmanager + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from taming.modules.vqvae.quantize import VectorQuantizer2 as 
VectorQuantizer + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.model import ( + Decoder, Encoder) +from modelscope.models.cv.image_to_3d.ldm.modules.distributions.distributions import \ + DiagonalGaussianDistribution +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +class VQModel(pl.LightningModule): + + def __init__( + self, + ddconfig, + lossconfig, + n_embed, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + batch_resize_range=None, + scheduler_config=None, + lr_g_factor=1.0, + remap=None, + sane_index_shape=False, # tell vector quantizer to return indices as bhw + use_ema=False): + super().__init__() + self.embed_dim = embed_dim + self.n_embed = n_embed + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + self.quantize = VectorQuantizer( + n_embed, + embed_dim, + beta=0.25, + remap=remap, + sane_index_shape=sane_index_shape) + self.quant_conv = torch.nn.Conv2d(ddconfig['z_channels'], embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + self.batch_resize_range = batch_resize_range + if self.batch_resize_range is not None: + print( + f'{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.' + ) + + self.use_ema = use_ema + if self.use_ema: + self.model_ema = LitEma(self) + print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + self.scheduler_config = scheduler_config + self.lr_g_factor = lr_g_factor + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print(f'{context}: Switched to EMA weights') + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print(f'{context}: Restored training weights') + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + missing, unexpected = self.load_state_dict(sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys: {missing}') + print(f'Unexpected Keys: {unexpected}') + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self) + + def encode(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + quant, emb_loss, info = self.quantize(h) + return quant, emb_loss, info + + def encode_to_prequant(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + return h + + def decode(self, quant): + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + return dec + + def decode_code(self, code_b): + quant_b = self.quantize.embed_code(code_b) + dec = self.decode(quant_b) + return dec + + def forward(self, input, return_pred_indices=False): + quant, diff, (_, _, ind) = self.encode(input) + dec = self.decode(quant) + if 
return_pred_indices: + return dec, diff, ind + return dec, diff + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + if self.batch_resize_range is not None: + lower_size = self.batch_resize_range[0] + upper_size = self.batch_resize_range[1] + if self.global_step <= 4: + # do the first few batches with max size to avoid later oom + new_resize = upper_size + else: + new_resize = np.random.choice( + np.arange(lower_size, upper_size + 16, 16)) + if new_resize != x.shape[2]: + x = F.interpolate(x, size=new_resize, mode='bicubic') + x = x.detach() + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + # https://github.com/pytorch/pytorch/issues/37142 + # try not to fool the heuristics + x = self.get_input(batch, self.image_key) + xrec, qloss, ind = self(x, return_pred_indices=True) + + if optimizer_idx == 0: + # autoencode + aeloss, log_dict_ae = self.loss( + qloss, + x, + xrec, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train', + predicted_indices=ind) + + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=True) + return aeloss + + if optimizer_idx == 1: + # discriminator + discloss, log_dict_disc = self.loss( + qloss, + x, + xrec, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=True) + return discloss + + def validation_step(self, batch, batch_idx): + log_dict = self._validation_step(batch, batch_idx) + # with self.ema_scope(): + # log_dict_ema = self._validation_step( + # batch, batch_idx, suffix='_ema') + return log_dict + + def _validation_step(self, batch, batch_idx, suffix=''): + x = self.get_input(batch, self.image_key) + xrec, qloss, ind = self(x, return_pred_indices=True) + aeloss, log_dict_ae = self.loss( + qloss, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + suffix, + predicted_indices=ind) + + discloss, log_dict_disc = self.loss( + qloss, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + suffix, + predicted_indices=ind) + rec_loss = log_dict_ae[f'val{suffix}/rec_loss'] + self.log( + f'val{suffix}/rec_loss', + rec_loss, + prog_bar=True, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True) + self.log( + f'val{suffix}/aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True) + if version.parse(pl.__version__) >= version.parse('1.4.0'): + del log_dict_ae[f'val{suffix}/rec_loss'] + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr_d = self.learning_rate + lr_g = self.lr_g_factor * self.learning_rate + print('lr_d', lr_d) + print('lr_g', lr_g) + opt_ae = torch.optim.Adam( + list(self.encoder.parameters()) + list(self.decoder.parameters()) + + list(self.quantize.parameters()) + + list(self.quant_conv.parameters()) + + list(self.post_quant_conv.parameters()), + lr=lr_g, + betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr_d, betas=(0.5, 0.9)) + + if self.scheduler_config is not None: + scheduler = instantiate_from_config(self.scheduler_config) + + print('Setting up LambdaLR scheduler...') + scheduler = [ + { + 'scheduler': + LambdaLR(opt_ae, lr_lambda=scheduler.schedule), + 'interval': 
'step', + 'frequency': 1 + }, + { + 'scheduler': + LambdaLR(opt_disc, lr_lambda=scheduler.schedule), + 'interval': 'step', + 'frequency': 1 + }, + ] + return [opt_ae, opt_disc], scheduler + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if only_inputs: + log['inputs'] = x + return log + xrec, _ = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['inputs'] = x + log['reconstructions'] = xrec + if plot_ema: + with self.ema_scope(): + xrec_ema, _ = self(x) + if x.shape[1] > 3: + xrec_ema = self.to_rgb(xrec_ema) + log['reconstructions_ema'] = xrec_ema + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. + return x + + +class VQModelInterface(VQModel): + + def __init__(self, embed_dim, *args, **kwargs): + super().__init__(embed_dim=embed_dim, *args, **kwargs) + self.embed_dim = embed_dim + + def encode(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + return h + + def decode(self, h, force_not_quantize=False): + # also go through quantization layer + if not force_not_quantize: + quant, emb_loss, info = self.quantize(h) + else: + quant = h + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + return dec + + +class AutoencoderKL(pl.LightningModule): + + def __init__( + self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + ): + super().__init__() + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig['double_z'] + self.quant_conv = torch.nn.Conv2d(2 * ddconfig['z_channels'], + 2 * embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + print(f'Restored from {path}') + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + 
return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log( + 'aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + + self.log( + 'discloss', + discloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val') + + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val') + + self.log('val/rec_loss', log_dict_ae['val/rec_loss']) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + opt_ae = torch.optim.Adam( + list(self.encoder.parameters()) + list(self.decoder.parameters()) + + list(self.quant_conv.parameters()) + + list(self.post_quant_conv.parameters()), + lr=lr, + betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['samples'] = self.decode(torch.randn_like(posterior.sample())) + log['reconstructions'] = xrec + log['inputs'] = x + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. 
+ return x + + +class IdentityFirstStage(torch.nn.Module): + + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/__init__.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py new file mode 100644 index 000000000..9783ee5b3 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py @@ -0,0 +1,973 @@ +from pathlib import Path + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.nn as nn +import torch.nn.functional as F +from skimage.io import imsave +from torch.optim.lr_scheduler import LambdaLR +from tqdm import tqdm + +from modelscope.models.cv.image_to_3d.ldm.base_utils import ( + concat_images_list, read_pickle) +from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer_network import ( + FrustumTV3DNet, NoisyTargetViewEncoder, SpatialTime3DNet) +from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer_utils import ( + create_target_volume, get_warp_coordinates) +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + make_ddim_timesteps, timestep_embedding) +from modelscope.models.cv.image_to_3d.ldm.modules.encoders.modules import \ + FrozenCLIPImageEmbedder +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +def disable_training_module(module: nn.Module): + module = module.eval() + module.train = disabled_train + for para in module.parameters(): + para.requires_grad = False + return module + + +def repeat_to_batch(tensor, B, VN): + t_shape = tensor.shape + ones = [1 for _ in range(len(t_shape) - 1)] + tensor_new = tensor.view(B, 1, *t_shape[1:]).repeat(1, VN, *ones).view( + B * VN, *t_shape[1:]) + return tensor_new + + +class UNetWrapper(nn.Module): + + def __init__(self, + diff_model_config, + drop_conditions=False, + drop_scheme='default', + use_zero_123=True): + super().__init__() + self.diffusion_model = instantiate_from_config(diff_model_config) + self.drop_conditions = drop_conditions + self.drop_scheme = drop_scheme + self.use_zero_123 = use_zero_123 + + def drop(self, cond, mask): + shape = cond.shape + B = shape[0] + cond = mask.view(B, *[1 for _ in range(len(shape) - 1)]) * cond + return cond + + def get_trainable_parameters(self): + return self.diffusion_model.get_trainable_parameters() + + def get_drop_scheme(self, B, device): + if self.drop_scheme == 'default': + random = torch.rand(B, dtype=torch.float32, device=device) + drop_clip = (random > 0.15) & (random <= 0.2) + drop_volume = (random > 0.1) & (random <= 0.15) + drop_concat = (random > 0.05) & (random <= 0.1) + drop_all = random <= 0.05 + else: + raise NotImplementedError + return drop_clip, drop_volume, drop_concat, drop_all + + 
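+    # Note on the 'default' drop scheme above: one uniform sample per batch
+    # element picks, with probability 0.05 each, whether to drop only the
+    # CLIP embedding, only the volume features, only the concatenated input
+    # latent, or all three conditions at once (otherwise nothing is dropped).
+    # forward() applies these masks when is_train=True, so that
+    # predict_with_unconditional_scale below can run classifier-free guidance
+    # against zeroed conditions at sampling time.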
def forward(self, + x, + t, + clip_embed, + volume_feats, + x_concat, + is_train=False): + """ + + @param x: B,4,H,W + @param t: B, + @param clip_embed: B,M,768 + @param volume_feats: B,C,D,H,W + @param x_concat: B,C,H,W + @param is_train: + @return: + """ + if self.drop_conditions and is_train: + B = x.shape[0] + drop_clip, drop_volume, drop_concat, drop_all = self.get_drop_scheme( + B, x.device) + + clip_mask = 1.0 - (drop_clip | drop_all).float() + clip_embed = self.drop(clip_embed, clip_mask) + + volume_mask = 1.0 - (drop_volume | drop_all).float() + for k, v in volume_feats.items(): + volume_feats[k] = self.drop(v, mask=volume_mask) + + concat_mask = 1.0 - (drop_concat | drop_all).float() + x_concat = self.drop(x_concat, concat_mask) + + if self.use_zero_123: + # zero123 does not multiply this when encoding, maybe a bug for zero123 + first_stage_scale_factor = 0.18215 + x_concat_ = x_concat * 1.0 + x_concat_[:, :4] = x_concat_[:, :4] / first_stage_scale_factor + else: + x_concat_ = x_concat + + x = torch.cat([x, x_concat_], 1) + pred = self.diffusion_model(x, t, clip_embed, source_dict=volume_feats) + return pred + + def predict_with_unconditional_scale(self, x, t, clip_embed, volume_feats, + x_concat, unconditional_scale): + x_ = torch.cat([x] * 2, 0) + t_ = torch.cat([t] * 2, 0) + clip_embed_ = torch.cat([clip_embed, torch.zeros_like(clip_embed)], 0) + + v_ = {} + for k, v in volume_feats.items(): + v_[k] = torch.cat([v, torch.zeros_like(v)], 0) + + x_concat_ = torch.cat([x_concat, torch.zeros_like(x_concat)], 0) + if self.use_zero_123: + # zero123 does not multiply this when encoding, maybe a bug for zero123 + first_stage_scale_factor = 0.18215 + x_concat_[:, :4] = x_concat_[:, :4] / first_stage_scale_factor + x_ = torch.cat([x_, x_concat_], 1) + s, s_uc = self.diffusion_model( + x_, t_, clip_embed_, source_dict=v_).chunk(2) + s = s_uc + unconditional_scale * (s - s_uc) + return s + + +class SpatialVolumeNet(nn.Module): + + def __init__( + self, + time_dim, + view_dim, + view_num, + input_image_size=256, + frustum_volume_depth=48, + spatial_volume_size=32, + spatial_volume_length=0.5, + frustum_volume_length=0.86603 # sqrt(3)/2 + ): + super().__init__() + self.target_encoder = NoisyTargetViewEncoder( + time_dim, view_dim, output_dim=16) + self.spatial_volume_feats = SpatialTime3DNet( + input_dim=16 * view_num, + time_dim=time_dim, + dims=(64, 128, 256, 512)) + self.frustum_volume_feats = FrustumTV3DNet( + 64, time_dim, view_dim, dims=(64, 128, 256, 512)) + + self.frustum_volume_length = frustum_volume_length + self.input_image_size = input_image_size + self.spatial_volume_size = spatial_volume_size + self.spatial_volume_length = spatial_volume_length + + self.frustum_volume_size = self.input_image_size // 8 + self.frustum_volume_depth = frustum_volume_depth + self.time_dim = time_dim + self.view_dim = view_dim + # our rendered images are 1.5 away from the origin, we assume camera is 1.5 away from the origin + self.default_origin_depth = 1.5 + + def construct_spatial_volume(self, x, t_embed, v_embed, target_poses, + target_Ks): + """ + @param x: B,N,4,H,W + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param target_poses: N,3,4 + @param target_Ks: N,3,3 + @return: + """ + B, N, _, H, W = x.shape + V = self.spatial_volume_size + device = x.device + + spatial_volume_verts = torch.linspace( + -self.spatial_volume_length, + self.spatial_volume_length, + V, + dtype=torch.float32, + device=device) + spatial_volume_verts = torch.stack( + torch.meshgrid(spatial_volume_verts, 
spatial_volume_verts, + spatial_volume_verts), -1) + spatial_volume_verts = spatial_volume_verts.reshape(1, V**3, + 3)[:, :, (2, 1, 0)] + spatial_volume_verts = spatial_volume_verts.view( + 1, V, V, V, 3).permute(0, 4, 1, 2, 3).repeat(B, 1, 1, 1, 1) + + # encode source features + t_embed_ = t_embed.view(B, 1, self.time_dim).repeat(1, N, 1).view( + B, N, self.time_dim) + # v_embed_ = v_embed.view(1, N, self.view_dim).repeat(B, 1, 1).view(B, N, self.view_dim) + v_embed_ = v_embed + target_Ks = target_Ks.unsqueeze(0).repeat(B, 1, 1, 1) + target_poses = target_poses.unsqueeze(0).repeat(B, 1, 1, 1) + + # extract 2D image features + spatial_volume_feats = [] + # project source features + for ni in range(0, N): + pose_source_ = target_poses[:, ni] + K_source_ = target_Ks[:, ni] + x_ = self.target_encoder(x[:, ni], t_embed_[:, ni], v_embed_[:, + ni]) + C = x_.shape[1] + + coords_source = get_warp_coordinates( + spatial_volume_verts, x_.shape[-1], self.input_image_size, + K_source_, pose_source_).view(B, V, V * V, 2) + unproj_feats_ = F.grid_sample( + x_, + coords_source, + mode='bilinear', + padding_mode='zeros', + align_corners=True) + unproj_feats_ = unproj_feats_.view(B, C, V, V, V) + spatial_volume_feats.append(unproj_feats_) + + spatial_volume_feats = torch.stack(spatial_volume_feats, + 1) # B,N,C,V,V,V + N = spatial_volume_feats.shape[1] + spatial_volume_feats = spatial_volume_feats.view(B, N * C, V, V, V) + + spatial_volume_feats = self.spatial_volume_feats( + spatial_volume_feats, t_embed) # b,64,32,32,32 + return spatial_volume_feats + + def construct_view_frustum_volume(self, spatial_volume, t_embed, v_embed, + poses, Ks, target_indices): + """ + @param spatial_volume: B,C,V,V,V + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param poses: N,3,4 + @param Ks: N,3,3 + @param target_indices: B,TN + @return: B*TN,C,H,W + """ + B, TN = target_indices.shape + H, W = self.frustum_volume_size, self.frustum_volume_size + D = self.frustum_volume_depth + V = self.spatial_volume_size + + near = torch.ones( + B * TN, + 1, + H, + W, + dtype=spatial_volume.dtype, + device=spatial_volume.device + ) * self.default_origin_depth - self.frustum_volume_length + far = torch.ones( + B * TN, + 1, + H, + W, + dtype=spatial_volume.dtype, + device=spatial_volume.device + ) * self.default_origin_depth + self.frustum_volume_length + + target_indices = target_indices.view(B * TN) # B*TN + poses_ = poses[target_indices] # B*TN,3,4 + Ks_ = Ks[target_indices] # B*TN,3,4 + volume_xyz, volume_depth = create_target_volume( + D, self.frustum_volume_size, self.input_image_size, poses_, Ks_, + near, far) # B*TN,3 or 1,D,H,W + + # since the spatial volume is constructed in [-spatial_volume_length,spatial_volume_length] + volume_xyz_ = volume_xyz / self.spatial_volume_length + volume_xyz_ = volume_xyz_.permute(0, 2, 3, 4, 1) # B*TN,D,H,W,3 + spatial_volume_ = spatial_volume.unsqueeze(1).repeat( + 1, TN, 1, 1, 1, 1).view(B * TN, -1, V, V, V) + volume_feats = F.grid_sample( + spatial_volume_, + volume_xyz_, + mode='bilinear', + padding_mode='zeros', + align_corners=True) # B*TN,C,D,H,W + + v_embed_ = v_embed[torch.arange(B)[:, None], + target_indices.view(B, TN)].view(B * TN, -1) # B*TN + t_embed_ = t_embed.unsqueeze(1).repeat(1, TN, 1).view(B * TN, -1) + volume_feats_dict = self.frustum_volume_feats(volume_feats, t_embed_, + v_embed_) + return volume_feats_dict, volume_depth + + +""" + SyncDreamer is a SoTA Novel View Synthesis model which can generate 16 consistent views seamlessly. 
+ Please refer to: https://arxiv.org/abs/2309.03453 for more technique details. +""" + + +class SyncMultiviewDiffusion(pl.LightningModule): + + def __init__( + self, + unet_config, + scheduler_config, + finetune_unet=False, + finetune_projection=True, + view_num=16, + image_size=256, + cfg_scale=3.0, + output_num=8, + batch_view_num=4, + drop_conditions=False, + drop_scheme='default', + clip_image_encoder_path='/apdcephfs/private_rondyliu/projects/clip/ViT-L-14.pt' + ): + super().__init__() + + self.finetune_unet = finetune_unet + self.finetune_projection = finetune_projection + + self.view_num = view_num + self.viewpoint_dim = 4 + self.output_num = output_num + self.image_size = image_size + + self.batch_view_num = batch_view_num + self.cfg_scale = cfg_scale + + self.clip_image_encoder_path = clip_image_encoder_path + + self._init_time_step_embedding() + self._init_first_stage() + self._init_schedule() + self._init_multiview() + self._init_clip_image_encoder() + self._init_clip_projection() + + self.spatial_volume = SpatialVolumeNet(self.time_embed_dim, + self.viewpoint_dim, + self.view_num) + self.model = UNetWrapper( + unet_config, + drop_conditions=drop_conditions, + drop_scheme=drop_scheme) + self.scheduler_config = scheduler_config + + latent_size = image_size // 8 + self.ddim = SyncDDIMSampler( + self, 200, 'uniform', 1.0, latent_size=latent_size) + + def _init_clip_projection(self): + self.cc_projection = nn.Linear(772, 768) + nn.init.eye_(list(self.cc_projection.parameters())[0][:768, :768]) + nn.init.zeros_(list(self.cc_projection.parameters())[1]) + self.cc_projection.requires_grad_(True) + + if not self.finetune_projection: + disable_training_module(self.cc_projection) + + def _init_multiview(self): + K, azs, _, _, poses = read_pickle( + self.clip_image_encoder_path.replace( + 'ViT-L-14.pt', f'camera-{self.view_num}.pkl')) + default_image_size = 256 + ratio = self.image_size / default_image_size + K = np.diag([ratio, ratio, 1]) @ K + K = torch.from_numpy(K.astype(np.float32)) # [3,3] + K = K.unsqueeze(0).repeat(self.view_num, 1, 1) # N,3,3 + poses = torch.from_numpy(poses.astype(np.float32)) # N,3,4 + self.register_buffer('poses', poses) + self.register_buffer('Ks', K) + azs = (azs + np.pi) % ( + np.pi * 2) - np.pi # scale to [-pi,pi] and the index=0 has az=0 + self.register_buffer('azimuth', + torch.from_numpy(azs.astype(np.float32))) + + def get_viewpoint_embedding(self, batch_size, elevation_ref): + """ + @param batch_size: + @param elevation_ref: B + @return: + """ + azimuth_input = self.azimuth[0].unsqueeze(0) # 1 + azimuth_target = self.azimuth # N + elevation_input = -elevation_ref # note that zero123 use a negative elevation here!!! 
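+        # The lines below build the relative-pose embedding: for each of the
+        # N fixed target views, stack the elevation offset d_e, sin/cos of the
+        # azimuth offset d_a from the input view, and a zero placeholder into
+        # a B,N,4 tensor that is consumed downstream as v_embed.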
+ elevation_target = -np.deg2rad(30) + d_e = elevation_target - elevation_input # B + N = self.azimuth.shape[0] + B = batch_size + d_e = d_e.unsqueeze(1).repeat(1, N) + d_a = azimuth_target - azimuth_input # N + d_a = d_a.unsqueeze(0).repeat(B, 1) + d_z = torch.zeros_like(d_a) + embedding = torch.stack( + [d_e, torch.sin(d_a), torch.cos(d_a), d_z], -1) # B,N,4 + return embedding + + def _init_first_stage(self): + first_stage_config = { + 'target': + 'modelscope.models.cv.image_to_3d.ldm.models.autoencoder.AutoencoderKL', + 'params': { + 'embed_dim': 4, + 'monitor': 'val/rec_loss', + 'ddconfig': { + 'double_z': True, + 'z_channels': 4, + 'resolution': self.image_size, + 'in_channels': 3, + 'out_ch': 3, + 'ch': 128, + 'ch_mult': [1, 2, 4, 4], + 'num_res_blocks': 2, + 'attn_resolutions': [], + 'dropout': 0.0 + }, + 'lossconfig': { + 'target': 'torch.nn.Identity' + }, + } + } + self.first_stage_scale_factor = 0.18215 + self.first_stage_model = instantiate_from_config(first_stage_config) + self.first_stage_model = disable_training_module( + self.first_stage_model) + + def _init_clip_image_encoder(self): + self.clip_image_encoder = FrozenCLIPImageEmbedder( + model=self.clip_image_encoder_path) + self.clip_image_encoder = disable_training_module( + self.clip_image_encoder) + + def _init_schedule(self): + self.num_timesteps = 1000 + linear_start = 0.00085 + linear_end = 0.0120 + num_timesteps = 1000 + betas = torch.linspace( + linear_start**0.5, + linear_end**0.5, + num_timesteps, + dtype=torch.float32)**2 # T + assert betas.shape[0] == self.num_timesteps + + # all in float64 first + alphas = 1. - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) # T + alphas_cumprod_prev = torch.cat( + [torch.ones(1, dtype=torch.float64), alphas_cumprod[:-1]], 0) + posterior_variance = betas * (1. - alphas_cumprod_prev) / ( + 1. - alphas_cumprod) # T + posterior_log_variance_clipped = torch.log( + torch.clamp(posterior_variance, min=1e-20)) + posterior_log_variance_clipped = torch.clamp( + posterior_log_variance_clipped, min=-10) + + self.register_buffer('betas', betas.float()) + self.register_buffer('alphas', alphas.float()) + self.register_buffer('alphas_cumprod', alphas_cumprod.float()) + self.register_buffer('sqrt_alphas_cumprod', + torch.sqrt(alphas_cumprod).float()) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + torch.sqrt(1 - alphas_cumprod).float()) + self.register_buffer('posterior_variance', posterior_variance.float()) + self.register_buffer('posterior_log_variance_clipped', + posterior_log_variance_clipped.float()) + + def _init_time_step_embedding(self): + self.time_embed_dim = 256 + self.time_embed = nn.Sequential( + nn.Linear(self.time_embed_dim, self.time_embed_dim), + nn.SiLU(True), + nn.Linear(self.time_embed_dim, self.time_embed_dim), + ) + + def encode_first_stage(self, x, sample=True): + with torch.no_grad(): + posterior = self.first_stage_model.encode(x) # b,4,h//8,w//8 + if sample: + return posterior.sample().detach( + ) * self.first_stage_scale_factor + else: + return posterior.mode().detach( + ) * self.first_stage_scale_factor + + def decode_first_stage(self, z): + with torch.no_grad(): + z = 1. 
/ self.first_stage_scale_factor * z + return self.first_stage_model.decode(z) + + def prepare(self, batch): + # encode target + if 'target_image' in batch: + image_target = batch['target_image'].permute(0, 1, 4, 2, + 3) # b,n,3,h,w + N = image_target.shape[1] + x = [ + self.encode_first_stage(image_target[:, ni], True) + for ni in range(N) + ] + x = torch.stack(x, 1) # b,n,4,h//8,w//8 + else: + x = None + + image_input = batch['input_image'].permute(0, 3, 1, 2) + elevation_input = batch['input_elevation'][:, 0] # b + x_input = self.encode_first_stage(image_input) + input_info = { + 'image': image_input, + 'elevation': elevation_input, + 'x': x_input + } + with torch.no_grad(): + clip_embed = self.clip_image_encoder.encode(image_input) + return x, clip_embed, input_info + + def embed_time(self, t): + t_embed = timestep_embedding( + t, self.time_embed_dim, repeat_only=False) # B,TED + t_embed = self.time_embed(t_embed) # B,TED + return t_embed + + def get_target_view_feats(self, x_input, spatial_volume, clip_embed, + t_embed, v_embed, target_index): + """ + @param x_input: B,4,H,W + @param spatial_volume: B,C,V,V,V + @param clip_embed: B,1,768 + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param target_index: B,TN + @return: + tensors of size B*TN,* + """ + B, _, H, W = x_input.shape + frustum_volume_feats, frustum_volume_depth = self.spatial_volume.construct_view_frustum_volume( + spatial_volume, t_embed, v_embed, self.poses, self.Ks, + target_index) + + # clip + TN = target_index.shape[1] + v_embed_ = v_embed[torch.arange(B)[:, None], + target_index].view(B * TN, + self.viewpoint_dim) # B*TN,v_dim + clip_embed_ = clip_embed.unsqueeze(1).repeat(1, TN, 1, + 1).view(B * TN, 1, 768) + clip_embed_ = self.cc_projection( + torch.cat([clip_embed_, v_embed_.unsqueeze(1)], -1)) # B*TN,1,768 + + x_input_ = x_input.unsqueeze(1).repeat(1, TN, 1, 1, + 1).view(B * TN, 4, H, W) + + x_concat = x_input_ + return clip_embed_, frustum_volume_feats, x_concat + + def training_step(self, batch): + B = batch['image'].shape[0] + time_steps = torch.randint( + 0, self.num_timesteps, (B, ), device=self.device).long() + + x, clip_embed, input_info = self.prepare(batch) + x_noisy, noise = self.add_noise(x, time_steps) # B,N,4,H,W + + N = self.view_num + target_index = torch.randint( + 0, N, (B, 1), device=self.device).long() # B, 1 + v_embed = self.get_viewpoint_embedding( + B, input_info['elevation']) # N,v_dim + + t_embed = self.embed_time(time_steps) + spatial_volume = self.spatial_volume.construct_spatial_volume( + x_noisy, t_embed, v_embed, self.poses, self.Ks) + + clip_embed, volume_feats, x_concat = self.get_target_view_feats( + input_info['x'], spatial_volume, clip_embed, t_embed, v_embed, + target_index) + + x_noisy_ = x_noisy[torch.arange(B)[:, None], + target_index][:, 0] # B,4,H,W + noise_predict = self.model( + x_noisy_, + time_steps, + clip_embed, + volume_feats, + x_concat, + is_train=True) # B,4,H,W + + noise_target = noise[torch.arange(B)[:, None], + target_index][:, 0] # B,4,H,W + # loss simple for diffusion + loss_simple = torch.nn.functional.mse_loss( + noise_target, noise_predict, reduction='none') + loss = loss_simple.mean() + self.log( + 'sim', + loss_simple.mean(), + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True, + rank_zero_only=True) + + # log others + lr = self.optimizers().param_groups[0]['lr'] + self.log( + 'lr', + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + rank_zero_only=True) + self.log( + 'step', + self.global_step, + prog_bar=True, 
+ logger=True, + on_step=True, + on_epoch=False, + rank_zero_only=True) + return loss + + def add_noise(self, x_start, t): + """ + @param x_start: B,* + @param t: B, + @return: + """ + B = x_start.shape[0] + noise = torch.randn_like(x_start) # B,* + + sqrt_alphas_cumprod_ = self.sqrt_alphas_cumprod[t] # B, + sqrt_one_minus_alphas_cumprod_ = self.sqrt_one_minus_alphas_cumprod[ + t] # B + sqrt_alphas_cumprod_ = sqrt_alphas_cumprod_.view( + B, *[1 for _ in range(len(x_start.shape) - 1)]) + sqrt_one_minus_alphas_cumprod_ = sqrt_one_minus_alphas_cumprod_.view( + B, *[1 for _ in range(len(x_start.shape) - 1)]) + x_noisy = sqrt_alphas_cumprod_ * x_start + sqrt_one_minus_alphas_cumprod_ * noise + return x_noisy, noise + + def sample(self, + batch, + cfg_scale, + batch_view_num, + use_ddim=True, + return_inter_results=False, + inter_interval=50, + inter_view_interval=2): + _, clip_embed, input_info = self.prepare(batch) + if use_ddim: + x_sample, inter = self.ddim.sample( + input_info, + clip_embed, + unconditional_scale=cfg_scale, + log_every_t=inter_interval, + batch_view_num=batch_view_num) + else: + raise NotImplementedError + + N = x_sample.shape[1] + x_sample = torch.stack( + [self.decode_first_stage(x_sample[:, ni]) for ni in range(N)], 1) + if return_inter_results: + torch.cuda.synchronize() + torch.cuda.empty_cache() + inter = torch.stack(inter['x_inter'], 2) # # B,N,T,C,H,W + B, N, T, C, H, W = inter.shape + inter_results = [] + for ni in tqdm(range(0, N, inter_view_interval)): + inter_results_ = [] + for ti in range(T): + inter_results_.append( + self.decode_first_stage(inter[:, ni, ti])) + inter_results.append(torch.stack(inter_results_, + 1)) # B,T,3,H,W + inter_results = torch.stack(inter_results, 1) # B,N,T,3,H,W + return x_sample, inter_results + else: + return x_sample + + def log_image(self, + x_sample, + batch, + step, + output_dir, + only_first_row=False): + + def process(x): + return ((torch.clip(x, min=-1, max=1).cpu().numpy() * 0.5 + 0.5) + * 255).astype(np.uint8) + + B = x_sample.shape[0] + N = x_sample.shape[1] + image_cond = [] + for bi in range(B): + img_pr_ = concat_images_list( + process(batch['ref_image'][bi]), *[ + process(x_sample[bi, ni].permute(1, 2, 0)) + for ni in range(N) + ]) + img_gt_ = concat_images_list( + process(batch['ref_image'][bi]), + *[process(batch['image'][bi, ni]) for ni in range(N)]) + if not only_first_row or bi == 0: + image_cond.append( + concat_images_list(img_gt_, img_pr_, vert=True)) + else: + image_cond.append(img_pr_) + + output_dir = Path(output_dir) + imsave( + str(output_dir / f'{step}.jpg'), + concat_images_list(*image_cond, vert=True)) + + @torch.no_grad() + def validation_step(self, batch, batch_idx): + if batch_idx == 0 and self.global_rank == 0: + self.eval() + step = self.global_step + batch_ = {} + for k, v in batch.items(): + batch_[k] = v[:self.output_num] + x_sample = self.sample(batch_, self.cfg_scale, self.batch_view_num) + output_dir = Path(self.image_dir) / 'images' / 'val' + output_dir.mkdir(exist_ok=True, parents=True) + self.log_image(x_sample, batch, step, output_dir=output_dir) + + def configure_optimizers(self): + lr = self.learning_rate + print(f'setting learning rate to {lr:.4f} ...') + paras = [] + if self.finetune_projection: + paras.append({ + 'params': self.cc_projection.parameters(), + 'lr': lr + }, ) + if self.finetune_unet: + paras.append({'params': self.model.parameters(), 'lr': lr}, ) + else: + paras.append( + { + 'params': self.model.get_trainable_parameters(), + 'lr': lr + }, ) + + paras.append({ + 
'params': self.time_embed.parameters(), + 'lr': lr * 10.0 + }, ) + paras.append( + { + 'params': self.spatial_volume.parameters(), + 'lr': lr * 10.0 + }, ) + + opt = torch.optim.AdamW(paras, lr=lr) + + scheduler = instantiate_from_config(self.scheduler_config) + print('Setting up LambdaLR scheduler...') + scheduler = [{ + 'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), + 'interval': 'step', + 'frequency': 1 + }] + return [opt], scheduler + + +class SyncDDIMSampler: + + def __init__(self, + model: SyncMultiviewDiffusion, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + latent_size=32): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.latent_size = latent_size + self._make_schedule(ddim_num_steps, ddim_discretize, ddim_eta) + self.eta = ddim_eta + + def _make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) # DT + ddim_timesteps_ = torch.from_numpy( + self.ddim_timesteps.astype(np.int64)) # DT + + alphas_cumprod = self.model.alphas_cumprod # T + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + self.ddim_alphas = alphas_cumprod[ddim_timesteps_].double() # DT + self.ddim_alphas_prev = torch.cat( + [alphas_cumprod[0:1], alphas_cumprod[ddim_timesteps_[:-1]]], + 0) # DT + self.ddim_sigmas = ddim_eta * torch.sqrt( # noqa + (1 - self.ddim_alphas_prev) / (1 - self.ddim_alphas) * # noqa + (1 - self.ddim_alphas / self.ddim_alphas_prev)) # noqa + + self.ddim_alphas_raw = self.model.alphas[ddim_timesteps_].float() # DT + self.ddim_sigmas = self.ddim_sigmas.float() + self.ddim_alphas = self.ddim_alphas.float() + self.ddim_alphas_prev = self.ddim_alphas_prev.float() + self.ddim_sqrt_one_minus_alphas = torch.sqrt( + 1. - self.ddim_alphas).float() + + @torch.no_grad() + def denoise_apply_impl(self, + x_target_noisy, + index, + noise_pred, + is_step0=False): + """ + @param x_target_noisy: B,N,4,H,W + @param index: index + @param noise_pred: B,N,4,H,W + @param is_step0: bool + @return: + """ + device = x_target_noisy.device + B, N, _, H, W = x_target_noisy.shape + + # apply noise + a_t = self.ddim_alphas[index].to(device).float().view(1, 1, 1, 1, 1) + a_prev = self.ddim_alphas_prev[index].to(device).float().view( + 1, 1, 1, 1, 1) + sqrt_one_minus_at = self.ddim_sqrt_one_minus_alphas[index].to( + device).float().view(1, 1, 1, 1, 1) + sigma_t = self.ddim_sigmas[index].to(device).float().view( + 1, 1, 1, 1, 1) + + pred_x0 = (x_target_noisy + - sqrt_one_minus_at * noise_pred) / a_t.sqrt() + dir_xt = torch.clamp( + 1. 
- a_prev - sigma_t**2, min=1e-7).sqrt() * noise_pred + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + if not is_step0: + noise = sigma_t * torch.randn_like(x_target_noisy) + x_prev = x_prev + noise + return x_prev + + @torch.no_grad() + def denoise_apply(self, + x_target_noisy, + input_info, + clip_embed, + time_steps, + index, + unconditional_scale, + batch_view_num=1, + is_step0=False): + """ + @param x_target_noisy: B,N,4,H,W + @param input_info: + @param clip_embed: B,M,768 + @param time_steps: B, + @param index: int + @param unconditional_scale: + @param batch_view_num: int + @param is_step0: bool + @return: + """ + x_input, elevation_input = input_info['x'], input_info['elevation'] + B, N, C, H, W = x_target_noisy.shape + + # construct source data + v_embed = self.model.get_viewpoint_embedding( + B, elevation_input) # B,N,v_dim + t_embed = self.model.embed_time(time_steps) # B,t_dim + spatial_volume = self.model.spatial_volume.construct_spatial_volume( + x_target_noisy, t_embed, v_embed, self.model.poses, self.model.Ks) + + e_t = [] + target_indices = torch.arange(N) # N + for ni in range(0, N, batch_view_num): + x_target_noisy_ = x_target_noisy[:, ni:ni + batch_view_num] + VN = x_target_noisy_.shape[1] + x_target_noisy_ = x_target_noisy_.reshape(B * VN, C, H, W) + + time_steps_ = repeat_to_batch(time_steps, B, VN) + target_indices_ = target_indices[ni:ni + batch_view_num].unsqueeze( + 0).repeat(B, 1) + clip_embed_, volume_feats_, x_concat_ = self.model.get_target_view_feats( + x_input, spatial_volume, clip_embed, t_embed, v_embed, + target_indices_) + if unconditional_scale != 1.0: + noise = self.model.model.predict_with_unconditional_scale( + x_target_noisy_, time_steps_, clip_embed_, volume_feats_, + x_concat_, unconditional_scale) + else: + noise = self.model.model( + x_target_noisy_, + time_steps_, + clip_embed_, + volume_feats_, + x_concat_, + is_train=False) + e_t.append(noise.view(B, VN, 4, H, W)) + + e_t = torch.cat(e_t, 1) + x_prev = self.denoise_apply_impl(x_target_noisy, index, e_t, is_step0) + return x_prev + + @torch.no_grad() + def sample(self, + input_info, + clip_embed, + unconditional_scale=1.0, + log_every_t=50, + batch_view_num=1): + """ + @param input_info: x, elevation + @param clip_embed: B,M,768 + @param unconditional_scale: + @param log_every_t: + @param batch_view_num: + @return: + """ + print(f'unconditional scale {unconditional_scale:.1f}') + C, H, W = 4, self.latent_size, self.latent_size + B = clip_embed.shape[0] + N = self.model.view_num + device = self.model.device + x_target_noisy = torch.randn([B, N, C, H, W], device=device) + + timesteps = self.ddim_timesteps + intermediates = {'x_inter': []} + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + for i, step in enumerate(iterator): + index = total_steps - i - 1 # index in ddim state + time_steps = torch.full((B, ), + step, + device=device, + dtype=torch.long) + x_target_noisy = self.denoise_apply( + x_target_noisy, + input_info, + clip_embed, + time_steps, + index, + unconditional_scale, + batch_view_num=batch_view_num, + is_step0=index == 0) + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(x_target_noisy) + + return x_target_noisy, intermediates diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py new file mode 100644 index 000000000..2457746e1 --- 
/dev/null
+++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py
@@ -0,0 +1,195 @@
+import torch
+import torch.nn as nn
+
+from modelscope.models.cv.image_to_3d.ldm.modules.attention import (  # noqa
+    checkpoint, default, zero_module)
+from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.openaimodel import \
+    UNetModel
+from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import \
+    timestep_embedding
+
+
+class DepthAttention(nn.Module):
+
+    def __init__(self,
+                 query_dim,
+                 context_dim,
+                 heads,
+                 dim_head,
+                 output_bias=True):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q = nn.Conv2d(query_dim, inner_dim, 1, 1, bias=False)
+        self.to_k = nn.Conv3d(context_dim, inner_dim, 1, 1, bias=False)
+        self.to_v = nn.Conv3d(context_dim, inner_dim, 1, 1, bias=False)
+        if output_bias:
+            self.to_out = nn.Conv2d(inner_dim, query_dim, 1, 1)
+        else:
+            self.to_out = nn.Conv2d(inner_dim, query_dim, 1, 1, bias=False)
+
+    def forward(self, x, context):
+        """
+
+        @param x: b,f0,h,w
+        @param context: b,f1,d,h,w
+        @return:
+        """
+        hn, hd = self.heads, self.dim_head
+        b, _, h, w = x.shape
+        b, _, d, h, w = context.shape
+
+        q = self.to_q(x).reshape(b, hn, hd, h, w)  # b,t,h,w
+        k = self.to_k(context).reshape(b, hn, hd, d, h, w)  # b,t,d,h,w
+        v = self.to_v(context).reshape(b, hn, hd, d, h, w)  # b,t,d,h,w
+
+        sim = torch.sum(q.unsqueeze(3) * k, 2) * self.scale  # b,hn,d,h,w
+        attn = sim.softmax(dim=2)
+
+        # b,hn,hd,d,h,w * b,hn,1,d,h,w
+        out = torch.sum(v * attn.unsqueeze(2), 3)  # b,hn,hd,h,w
+        out = out.reshape(b, hn * hd, h, w)
+        return self.to_out(out)
+
+
+class DepthTransformer(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 n_heads,
+                 d_head,
+                 context_dim=None,
+                 checkpoint=True):
+        super().__init__()
+        inner_dim = n_heads * d_head
+        self.proj_in = nn.Sequential(
+            nn.Conv2d(dim, inner_dim, 1, 1),
+            nn.GroupNorm(8, inner_dim),
+            nn.SiLU(True),
+        )
+        self.proj_context = nn.Sequential(
+            nn.Conv3d(context_dim, context_dim, 1, 1, bias=False),  # no bias
+            nn.GroupNorm(8, context_dim),
+            nn.ReLU(
+                True),  # ReLU only, so that a zero input yields a zero output
+        )
+        self.depth_attn = DepthAttention(
+            query_dim=inner_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            context_dim=context_dim,
+            output_bias=False
+        )  # is a self-attention if not self.disable_self_attn
+        self.proj_out = nn.Sequential(
+            nn.GroupNorm(8, inner_dim),
+            nn.ReLU(True),
+            nn.Conv2d(inner_dim, inner_dim, 3, 1, 1, bias=False),
+            nn.GroupNorm(8, inner_dim),
+            nn.ReLU(True),
+            zero_module(
+                nn.Conv2d(inner_dim, dim, 3, 1, 1, bias=False)),
+        )
+        self.checkpoint = checkpoint
+
+    def forward(self, x, context=None):
+        return checkpoint(self._forward, (x, context), self.parameters(),
+                          self.checkpoint)
+
+    def _forward(self, x, context):
+        x_in = x
+        x = self.proj_in(x)
+        context = self.proj_context(context)
+        x = self.depth_attn(x, context)
+        x = self.proj_out(x) + x_in
+        return x
+
+
+class DepthWiseAttention(UNetModel):
+
+    def __init__(self, volume_dims=(5, 16, 32, 64), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # num_heads = 4
+        model_channels = kwargs['model_channels']
+        channel_mult = kwargs['channel_mult']
+        d0, d1, d2, d3 = volume_dims
+
+        # 4
+        ch = model_channels * channel_mult[2]
+        self.middle_conditions = DepthTransformer(
+            ch, 4, d3 // 2, context_dim=d3)
+
+        self.output_conditions =
nn.ModuleList() + self.output_b2c = { + 3: 0, + 4: 1, + 5: 2, + 6: 3, + 7: 4, + 8: 5, + 9: 6, + 10: 7, + 11: 8 + } + # 8 + ch = model_channels * channel_mult[2] + self.output_conditions.append( + DepthTransformer(ch, 4, d2 // 2, context_dim=d2)) # 0 + self.output_conditions.append( + DepthTransformer(ch, 4, d2 // 2, context_dim=d2)) # 1 + # 16 + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 2 + ch = model_channels * channel_mult[1] + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 3 + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 4 + # 32 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 5 + ch = model_channels * channel_mult[0] + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 6 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 7 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 8 + + def forward(self, + x, + timesteps=None, + context=None, + source_dict=None, + **kwargs): + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + h = x.type(self.dtype) + for index, module in enumerate(self.input_blocks): + h = module(h, emb, context) + hs.append(h) + + h = self.middle_block(h, emb, context) + h = self.middle_conditions(h, context=source_dict[h.shape[-1]]) + + for index, module in enumerate(self.output_blocks): + h = torch.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + if index in self.output_b2c: + layer = self.output_conditions[self.output_b2c[index]] + h = layer(h, context=source_dict[h.shape[-1]]) + + h = h.type(x.dtype) + return self.out(h) + + def get_trainable_parameters(self): + paras = [para for para in self.middle_conditions.parameters() + ] + [para for para in self.output_conditions.parameters()] + return paras diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py new file mode 100644 index 000000000..9b3d6616d --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py @@ -0,0 +1,233 @@ +import torch +import torch.nn as nn + + +class Image2DResBlockWithTV(nn.Module): + + def __init__(self, dim, tdim, vdim): + super().__init__() + + def norm(c): + return nn.GroupNorm(8, c) + + self.time_embed = nn.Conv2d(tdim, dim, 1, 1) + self.view_embed = nn.Conv2d(vdim, dim, 1, 1) + self.conv = nn.Sequential( + norm(dim), + nn.SiLU(True), + nn.Conv2d(dim, dim, 3, 1, 1), + norm(dim), + nn.SiLU(True), + nn.Conv2d(dim, dim, 3, 1, 1), + ) + + def forward(self, x, t, v): + return x + self.conv(x + self.time_embed(t) + self.view_embed(v)) + + +class NoisyTargetViewEncoder(nn.Module): + + def __init__(self, + time_embed_dim, + viewpoint_dim, + run_dim=16, + output_dim=8): + super().__init__() + + self.init_conv = nn.Conv2d(4, run_dim, 3, 1, 1) + self.out_conv0 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.out_conv1 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.out_conv2 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.final_out = nn.Sequential( + nn.GroupNorm(8, run_dim), nn.SiLU(True), + nn.Conv2d(run_dim, output_dim, 3, 1, 1)) + + def forward(self, x, t, v): + B, DT = t.shape + t = t.view(B, DT, 1, 1) + B, DV = v.shape + v = 
v.view(B, DV, 1, 1) + + x = self.init_conv(x) + x = self.out_conv0(x, t, v) + x = self.out_conv1(x, t, v) + x = self.out_conv2(x, t, v) + x = self.final_out(x) + return x + + +class SpatialUpTimeBlock(nn.Module): + + def __init__(self, x_in_dim, t_in_dim, out_dim): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_in_dim, x_in_dim, 1, 1) # 16 + self.norm = norm_act(x_in_dim) + self.silu = nn.SiLU(True) + self.conv = nn.ConvTranspose3d( + x_in_dim, + out_dim, + kernel_size=3, + padding=1, + output_padding=1, + stride=2) + + def forward(self, x, t): + x = x + self.t_conv(t) + return self.conv(self.silu(self.norm(x))) + + +class SpatialTimeBlock(nn.Module): + + def __init__(self, x_in_dim, t_in_dim, out_dim, stride): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_in_dim, x_in_dim, 1, 1) # 16 + self.bn = norm_act(x_in_dim) + self.silu = nn.SiLU(True) + self.conv = nn.Conv3d(x_in_dim, out_dim, 3, stride=stride, padding=1) + + def forward(self, x, t): + x = x + self.t_conv(t) + return self.conv(self.silu(self.bn(x))) + + +class SpatialTime3DNet(nn.Module): + + def __init__(self, time_dim=256, input_dim=128, dims=(32, 64, 128, 256)): + super().__init__() + d0, d1, d2, d3 = dims + dt = time_dim + + self.init_conv = nn.Conv3d(input_dim, d0, 3, 1, 1) # 32 + self.conv0 = SpatialTimeBlock(d0, dt, d0, stride=1) + + self.conv1 = SpatialTimeBlock(d0, dt, d1, stride=2) + self.conv2_0 = SpatialTimeBlock(d1, dt, d1, stride=1) + self.conv2_1 = SpatialTimeBlock(d1, dt, d1, stride=1) + + self.conv3 = SpatialTimeBlock(d1, dt, d2, stride=2) + self.conv4_0 = SpatialTimeBlock(d2, dt, d2, stride=1) + self.conv4_1 = SpatialTimeBlock(d2, dt, d2, stride=1) + + self.conv5 = SpatialTimeBlock(d2, dt, d3, stride=2) + self.conv6_0 = SpatialTimeBlock(d3, dt, d3, stride=1) + self.conv6_1 = SpatialTimeBlock(d3, dt, d3, stride=1) + + self.conv7 = SpatialUpTimeBlock(d3, dt, d2) + self.conv8 = SpatialUpTimeBlock(d2, dt, d1) + self.conv9 = SpatialUpTimeBlock(d1, dt, d0) + + def forward(self, x, t): + B, C = t.shape + t = t.view(B, C, 1, 1, 1) + + x = self.init_conv(x) + conv0 = self.conv0(x, t) + + x = self.conv1(conv0, t) + x = self.conv2_0(x, t) + conv2 = self.conv2_1(x, t) + + x = self.conv3(conv2, t) + x = self.conv4_0(x, t) + conv4 = self.conv4_1(x, t) + + x = self.conv5(conv4, t) + x = self.conv6_0(x, t) + x = self.conv6_1(x, t) + + x = conv4 + self.conv7(x, t) + x = conv2 + self.conv8(x, t) + x = conv0 + self.conv9(x, t) + return x + + +class FrustumTVBlock(nn.Module): + + def __init__(self, x_dim, t_dim, v_dim, out_dim, stride): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_dim, x_dim, 1, 1) # 16 + self.v_conv = nn.Conv3d(v_dim, x_dim, 1, 1) # 16 + self.bn = norm_act(x_dim) + self.silu = nn.SiLU(True) + self.conv = nn.Conv3d(x_dim, out_dim, 3, stride=stride, padding=1) + + def forward(self, x, t, v): + x = x + self.t_conv(t) + self.v_conv(v) + return self.conv(self.silu(self.bn(x))) + + +class FrustumTVUpBlock(nn.Module): + + def __init__(self, x_dim, t_dim, v_dim, out_dim): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_dim, x_dim, 1, 1) # 16 + self.v_conv = nn.Conv3d(v_dim, x_dim, 1, 1) # 16 + self.norm = norm_act(x_dim) + self.silu = nn.SiLU(True) + self.conv = nn.ConvTranspose3d( + x_dim, + out_dim, + kernel_size=3, + padding=1, + output_padding=1, + stride=2) + + def forward(self, x, t, v): + x = x + 
self.t_conv(t) + self.v_conv(v) + return self.conv(self.silu(self.norm(x))) + + +class FrustumTV3DNet(nn.Module): + + def __init__(self, in_dim, t_dim, v_dim, dims=(32, 64, 128, 256)): + super().__init__() + self.conv0 = nn.Conv3d(in_dim, dims[0], 3, 1, 1) # 32 + + self.conv1 = FrustumTVBlock(dims[0], t_dim, v_dim, dims[1], 2) + self.conv2 = FrustumTVBlock(dims[1], t_dim, v_dim, dims[1], 1) + + self.conv3 = FrustumTVBlock(dims[1], t_dim, v_dim, dims[2], 2) + self.conv4 = FrustumTVBlock(dims[2], t_dim, v_dim, dims[2], 1) + + self.conv5 = FrustumTVBlock(dims[2], t_dim, v_dim, dims[3], 2) + self.conv6 = FrustumTVBlock(dims[3], t_dim, v_dim, dims[3], 1) + + self.up0 = FrustumTVUpBlock(dims[3], t_dim, v_dim, dims[2]) + self.up1 = FrustumTVUpBlock(dims[2], t_dim, v_dim, dims[1]) + self.up2 = FrustumTVUpBlock(dims[1], t_dim, v_dim, dims[0]) + + def forward(self, x, t, v): + B, DT = t.shape + t = t.view(B, DT, 1, 1, 1) + B, DV = v.shape + v = v.view(B, DV, 1, 1, 1) + + b, _, d, h, w = x.shape + x0 = self.conv0(x) + x1 = self.conv2(self.conv1(x0, t, v), t, v) + x2 = self.conv4(self.conv3(x1, t, v), t, v) + x3 = self.conv6(self.conv5(x2, t, v), t, v) + + x2 = self.up0(x3, t, v) + x2 + x1 = self.up1(x2, t, v) + x1 + x0 = self.up2(x1, t, v) + x0 + return {w: x0, w // 2: x1, w // 4: x2, w // 8: x3} diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py new file mode 100644 index 000000000..e7f2921ff --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py @@ -0,0 +1,130 @@ +import torch +from kornia import create_meshgrid + + +def project_and_normalize(ref_grid, src_proj, length): + """ + + @param ref_grid: b 3 n + @param src_proj: b 4 4 + @param length: int + @return: b, n, 2 + """ + src_grid = src_proj[:, :3, :3] @ ref_grid + src_proj[:, :3, 3:] # b 3 n + div_val = src_grid[:, -1:] + div_val[div_val < 1e-4] = 1e-4 + src_grid = src_grid[:, :2] / div_val # divide by depth (b, 2, n) + src_grid[:, 0] = src_grid[:, 0] / ((length - 1) / 2) - 1 # scale to -1~1 + src_grid[:, 1] = src_grid[:, 1] / ((length - 1) / 2) - 1 # scale to -1~1 + src_grid = src_grid.permute(0, 2, 1) # (b, n, 2) + return src_grid + + +def construct_project_matrix(x_ratio, y_ratio, Ks, poses): + """ + @param x_ratio: float + @param y_ratio: float + @param Ks: b,3,3 + @param poses: b,3,4 + @return: + """ + rfn = Ks.shape[0] + scale_m = torch.tensor([x_ratio, y_ratio, 1.0], + dtype=torch.float32, + device=Ks.device) + scale_m = torch.diag(scale_m) + ref_prj = scale_m[None, :, :] @ Ks @ poses # rfn,3,4 + pad_vals = torch.zeros([rfn, 1, 4], + dtype=torch.float32, + device=ref_prj.device) + pad_vals[:, :, 3] = 1.0 + ref_prj = torch.cat([ref_prj, pad_vals], 1) # rfn,4,4 + return ref_prj + + +def get_warp_coordinates(volume_xyz, warp_size, input_size, Ks, warp_pose): + B, _, D, H, W = volume_xyz.shape + ratio = warp_size / input_size + warp_proj = construct_project_matrix(ratio, ratio, Ks, warp_pose) # B,4,4 + warp_coords = project_and_normalize( + volume_xyz.view(B, 3, D * H * W), warp_proj, + warp_size).view(B, D, H, W, 2) + return warp_coords + + +def create_target_volume(depth_size, + volume_size, + input_image_size, + pose_target, + K, + near=None, + far=None): + device, dtype = pose_target.device, pose_target.dtype + + # compute a depth range on the unit sphere + H, W, D, B = volume_size, volume_size, depth_size, pose_target.shape[0] + if near is not None and far is not None: + # near, far 
b,1,h,w + depth_values = torch.linspace( + 0, 1, steps=depth_size).to(near.device).to(near.dtype) # d + depth_values = depth_values.view(1, D, 1, 1) # 1,d,1,1 + depth_values = depth_values * (far - near) + near # b d h w + depth_values = depth_values.view(B, 1, D, H * W) + else: + near, far = near_far_from_unit_sphere_using_camera_poses( + pose_target) # b 1 + depth_values = torch.linspace( + 0, 1, steps=depth_size).to(near.device).to(near.dtype) # d + depth_values = depth_values[None, :, None] * ( + far[:, None, :] - near[:, None, :]) + near[:, None, :] # b d 1 + depth_values = depth_values.view(B, 1, D, 1).expand(B, 1, D, H * W) + + ratio = volume_size / input_image_size + + # creat a grid on the target (reference) view + # H, W, D, B = volume_size, volume_size, depth_values.shape[1], depth_values.shape[0] + + # creat mesh grid: note reference also means target + ref_grid = create_meshgrid( + H, W, normalized_coordinates=False) # (1, H, W, 2) + ref_grid = ref_grid.to(device).to(dtype) + ref_grid = ref_grid.permute(0, 3, 1, 2) # (1, 2, H, W) + ref_grid = ref_grid.reshape(1, 2, H * W) # (1, 2, H*W) + ref_grid = ref_grid.expand(B, -1, -1) # (B, 2, H*W) + ref_grid = torch.cat( + (ref_grid, + torch.ones(B, 1, H * W, dtype=ref_grid.dtype, + device=ref_grid.device)), + dim=1) # (B, 3, H*W) + ref_grid = ref_grid.unsqueeze(2) * depth_values # (B, 3, D, H*W) + + # unproject to space and transfer to world coordinates. + Ks = K + ref_proj = construct_project_matrix(ratio, ratio, Ks, pose_target) # B,4,4 + ref_proj_inv = torch.inverse(ref_proj) # B,4,4 + ref_grid = ref_proj_inv[:, :3, :3] @ ref_grid.view( + B, 3, D * H + * W) + ref_proj_inv[:, :3, 3:] # B,3,3 @ B,3,DHW + B,3,1 => B,3,DHW + return ref_grid.reshape(B, 3, D, H, W), depth_values.view(B, 1, D, H, W) + + +def near_far_from_unit_sphere_using_camera_poses(camera_poses): + """ + @param camera_poses: b 3 4 + @return: + near: b,1 + far: b,1 + """ + R_w2c = camera_poses[..., :3, :3] # b 3 3 + t_w2c = camera_poses[..., :3, 3:] # b 3 1 + camera_origin = -R_w2c.permute(0, 2, 1) @ t_w2c # b 3 1 + # R_w2c.T @ (0,0,1) = z_dir + camera_orient = R_w2c.permute(0, 2, 1)[..., :3, 2:3] # b 3 1 + camera_origin, camera_orient = camera_origin[..., + 0], camera_orient[..., + 0] # b 3 + a = torch.sum(camera_orient**2, dim=-1, keepdim=True) # b 1 + b = -torch.sum(camera_orient * camera_origin, dim=-1, keepdim=True) # b 1 + mid = b / a # b 1 + near, far = mid - 1.0, mid + 1.0 + return near, far diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/attention.py b/modelscope/models/cv/image_to_3d/ldm/modules/attention.py new file mode 100644 index 000000000..aeab0a064 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/attention.py @@ -0,0 +1,382 @@ +import math +from inspect import isfunction + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import einsum, nn + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import \ + checkpoint + + +def exists(val): + return val is not None + + +def uniq(arr): + return {el: True for el in arr}.keys() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + + +# feedforward +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def 
forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +# feedforward +class ConvGEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Conv2d(dim_in, dim_out * 2, 1, 1, 0) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + +class LinearAttention(nn.Module): + + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange( + qkv, + 'b (qkv heads c) h w -> qkv b heads c (h w)', + heads=self.heads, + qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange( + out, + 'b heads c (h w) -> b (heads c) h w', + heads=self.heads, + h=h, + w=w) + return self.to_out(out) + + +class SpatialSelfAttention(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = rearrange(q, 'b c h w -> b (h w) c') + k = rearrange(k, 'b c h w -> b c (h w)') + w_ = torch.einsum('bij,bjk->bik', q, k) + + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, 'b c h w -> b c (h w)') + w_ = rearrange(w_, 'b i j -> b j i') + h_ = torch.einsum('bij,bjk->bik', v, w_) + h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) + h_ = self.proj_out(h_) + + return x + h_ + + +class CrossAttention(nn.Module): + + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + + def forward(self, x, context=None, mask=None): + h = 
self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), + (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + if exists(mask): + mask = mask > 0 + mask = rearrange(mask, 'b ... -> b (...)') + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, 'b j -> (b h) () j', h=h) + sim.masked_fill_(~mask, max_neg_value) + + # attention, what we cannot get enough of + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + return self.to_out(out) + + +class BasicSpatialTransformer(nn.Module): + + def __init__(self, + dim, + n_heads, + d_head, + context_dim=None, + checkpoint=True): + super().__init__() + inner_dim = n_heads * d_head + self.proj_in = nn.Sequential( + nn.GroupNorm(8, dim), + nn.Conv2d(dim, inner_dim, kernel_size=1, stride=1, padding=0), + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + ) + self.attn = CrossAttention( + query_dim=inner_dim, + heads=n_heads, + dim_head=d_head, + context_dim=context_dim + ) # is a self-attention if not self.disable_self_attn + self.out_conv = nn.Sequential( + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + nn.Conv2d(inner_dim, inner_dim, 1, 1), + ) + self.proj_out = nn.Sequential( + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + zero_module( + nn.Conv2d(inner_dim, dim, kernel_size=1, stride=1, padding=0)), + ) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context): + # input + b, _, h, w = x.shape + x_in = x + x = self.proj_in(x) + + # attention + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + context = rearrange(context, 'b c h w -> b (h w) c').contiguous() + x = self.attn(x, context) + x + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + + # output + x = self.out_conv(x) + x + x = self.proj_out(x) + x_in + return x + + +class BasicTransformerBlock(nn.Module): + + def __init__(self, + dim, + n_heads, + d_head, + dropout=0., + context_dim=None, + gated_ff=True, + checkpoint=True, + disable_self_attn=False): + super().__init__() + self.disable_self_attn = disable_self_attn + self.attn1 = CrossAttention( + query_dim=dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + context_dim=context_dim if self.disable_self_attn else + None) # is a self-attention if not self.disable_self_attn + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = CrossAttention( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context=None): + x = self.attn1( + self.norm1(x), + context=context if self.disable_self_attn else None) + x + x = self.attn2(self.norm2(x), context=context) + x + x = self.ff(self.norm3(x)) + x + return x + + +class ConvFeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential( + nn.Conv2d(dim, inner_dim, 1, 1, 0), + 
nn.GELU()) if not glu else ConvGEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Conv2d(inner_dim, dim_out, 1, 1, 0)) + + def forward(self, x): + return self.net(x) + + +class SpatialTransformer(nn.Module): + """ + Transformer block for image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + """ + + def __init__(self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0., + context_dim=None, + disable_self_attn=False): + super().__init__() + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + + self.transformer_blocks = nn.ModuleList([ + BasicTransformerBlock( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim, + disable_self_attn=disable_self_attn) for d in range(depth) + ]) + + self.proj_out = zero_module( + nn.Conv2d( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0)) + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + x = self.proj_in(x) + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + for block in self.transformer_blocks: + x = block(x, context=context) + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + x = self.proj_out(x) + return x + x_in diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py new file mode 100644 index 000000000..83780c98e --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py @@ -0,0 +1,964 @@ +# pytorch_diffusion + derived encoder decoder +import math + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange + +from modelscope.models.cv.image_to_3d.ldm.modules.attention import \ + LinearAttention +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def get_timestep_embedding(timesteps, embedding_dim): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: + From Fairseq. + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". 
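+    Concretely, each timestep t is mapped to
+    [sin(t * f_0), ..., sin(t * f_{h-1}), cos(t * f_0), ..., cos(t * f_{h-1})]
+    with f_i = 10000 ** (-i / (h - 1)) and h = embedding_dim // 2; the result
+    is zero-padded when embedding_dim is odd.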
+ """ + assert len(timesteps.shape) == 1 + + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) + emb = emb.to(device=timesteps.device) + emb = timesteps.float()[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm( + num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + + +class Upsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, scale_factor=2.0, mode='nearest') + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode='constant', value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + + def __init__(self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class LinAttnBlock(LinearAttention): + """to match AttnBlock usage""" + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +class AttnBlock(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = 
torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, c, h * w) + q = q.permute(0, 2, 1) # b,hw,c + k = k.reshape(b, c, h * w) # b,c,hw + w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b, c, h * w) + w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm( + v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b, c, h, w) + + h_ = self.proj_out(h_) + + return x + h_ + + +def make_attn(in_channels, attn_type='vanilla'): + assert attn_type in ['vanilla', 'linear', + 'none'], f'attn_type {attn_type} unknown' + print( + f"making attention of type '{attn_type}' with {in_channels} in_channels" + ) + if attn_type == 'vanilla': + return AttnBlock(in_channels) + elif attn_type == 'none': + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) + + +class Model(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + use_timestep=True, + use_linear_attn=False, + attn_type='vanilla'): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = self.ch * 4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList([ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ]) + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + skip_in = ch * 
ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + if i_block == self.num_res_blocks: + skip_in = ch * in_ch_mult[i_level] + block.append( + ResnetBlock( + in_channels=block_in + skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, x, t=None, context=None): + # assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], + dim=1), temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + def get_last_layer(self): + return self.conv_out.weight + + +class Encoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + double_z=True, + use_linear_attn=False, + attn_type='vanilla', + **ignore_kwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = 
Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type='vanilla', + **ignorekwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + # in_ch_mult = (1, ) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2**(self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print('Working with z of shape {} = {} dimensions.'.format( + self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, 
kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class SimpleDecoder(nn.Module): + + def __init__(self, in_channels, out_channels, *args, **kwargs): + super().__init__() + self.model = nn.ModuleList([ + nn.Conv2d(in_channels, in_channels, 1), + ResnetBlock( + in_channels=in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=2 * in_channels, + out_channels=4 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=4 * in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + nn.Conv2d(2 * in_channels, in_channels, 1), + Upsample(in_channels, with_conv=True) + ]) + # end + self.norm_out = Normalize(in_channels) + self.conv_out = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + for i, layer in enumerate(self.model): + if i in [1, 2, 3]: + x = layer(x, None) + else: + x = layer(x) + + h = self.norm_out(x) + h = nonlinearity(h) + x = self.conv_out(h) + return x + + +class UpsampleDecoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ch, + num_res_blocks, + resolution, + ch_mult=(2, 2), + dropout=0.0): + super().__init__() + # upsampling + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = in_channels + curr_res = resolution // 2**(self.num_resolutions - 1) + self.res_blocks = nn.ModuleList() + self.upsample_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + res_block = [] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + res_block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + self.res_blocks.append(nn.ModuleList(res_block)) + if i_level != self.num_resolutions - 1: + self.upsample_blocks.append(Upsample(block_in, True)) + curr_res = curr_res * 2 + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # upsampling + h = x + for k, i_level in enumerate(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.res_blocks[i_level][i_block](h, None) + if i_level != self.num_resolutions - 1: + h = self.upsample_blocks[k](h) + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class LatentRescaler(nn.Module): + + def __init__(self, + factor, + in_channels, + mid_channels, + out_channels, + depth=2): + super().__init__() + # residual block, interpolate, residual block + self.factor = factor + self.conv_in = nn.Conv2d( + in_channels, mid_channels, kernel_size=3, stride=1, padding=1) + self.res_block1 = 
nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + self.attn = AttnBlock(mid_channels) + self.res_block2 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) + + def forward(self, x): + x = self.conv_in(x) + for block in self.res_block1: + x = block(x, None) + x = torch.nn.functional.interpolate( + x, + size=(int(round(x.shape[2] * self.factor)), + int(round(x.shape[3] * self.factor)))) + x = self.attn(x) + for block in self.res_block2: + x = block(x, None) + x = self.conv_out(x) + return x + + +class MergedRescaleEncoder(nn.Module): + + def __init__(self, + in_channels, + ch, + resolution, + out_ch, + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + ch_mult=(1, 2, 4, 8), + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + intermediate_chn = ch * ch_mult[-1] + self.encoder = Encoder( + in_channels=in_channels, + num_res_blocks=num_res_blocks, + ch=ch, + ch_mult=ch_mult, + z_channels=intermediate_chn, + double_z=False, + resolution=resolution, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + out_ch=None) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=intermediate_chn, + mid_channels=intermediate_chn, + out_channels=out_ch, + depth=rescale_module_depth) + + def forward(self, x): + x = self.encoder(x) + x = self.rescaler(x) + return x + + +class MergedRescaleDecoder(nn.Module): + + def __init__(self, + z_channels, + out_ch, + resolution, + num_res_blocks, + attn_resolutions, + ch, + ch_mult=(1, 2, 4, 8), + dropout=0.0, + resamp_with_conv=True, + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + tmp_chn = z_channels * ch_mult[-1] + self.decoder = Decoder( + out_ch=out_ch, + z_channels=tmp_chn, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + in_channels=None, + num_res_blocks=num_res_blocks, + ch_mult=ch_mult, + resolution=resolution, + ch=ch) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=z_channels, + mid_channels=tmp_chn, + out_channels=tmp_chn, + depth=rescale_module_depth) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Upsampler(nn.Module): + + def __init__(self, + in_size, + out_size, + in_channels, + out_channels, + ch_mult=2): + super().__init__() + assert out_size >= in_size + num_blocks = int(np.log2(out_size // in_size)) + 1 + factor_up = 1. 
+ (out_size % in_size) + print( + f'Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}' + ) + self.rescaler = LatentRescaler( + factor=factor_up, + in_channels=in_channels, + mid_channels=2 * in_channels, + out_channels=in_channels) + self.decoder = Decoder( + out_ch=out_channels, + resolution=out_size, + z_channels=in_channels, + num_res_blocks=2, + attn_resolutions=[], + in_channels=None, + ch=in_channels, + ch_mult=[ch_mult for _ in range(num_blocks)]) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Resize(nn.Module): + + def __init__(self, in_channels=None, learned=False, mode='bilinear'): + super().__init__() + self.with_conv = learned + self.mode = mode + if self.with_conv: + print( + f'Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode' + ) + raise NotImplementedError() + assert in_channels is not None + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x, scale_factor=1.0): + if scale_factor == 1.0: + return x + else: + x = torch.nn.functional.interpolate( + x, + mode=self.mode, + align_corners=False, + scale_factor=scale_factor) + return x + + +class FirstStagePostProcessor(nn.Module): + + def __init__(self, + ch_mult: list, + in_channels, + pretrained_model: nn.Module = None, + reshape=False, + n_channels=None, + dropout=0., + pretrained_config=None): + super().__init__() + if pretrained_config is None: + assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' + self.pretrained_model = pretrained_model + else: + assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' + self.instantiate_pretrained(pretrained_config) + + self.do_reshape = reshape + + if n_channels is None: + n_channels = self.pretrained_model.encoder.ch + + self.proj_norm = Normalize(in_channels, num_groups=in_channels // 2) + self.proj = nn.Conv2d( + in_channels, n_channels, kernel_size=3, stride=1, padding=1) + + blocks = [] + downs = [] + ch_in = n_channels + for m in ch_mult: + blocks.append( + ResnetBlock( + in_channels=ch_in, + out_channels=m * n_channels, + dropout=dropout)) + ch_in = m * n_channels + downs.append(Downsample(ch_in, with_conv=False)) + + self.model = nn.ModuleList(blocks) + self.downsampler = nn.ModuleList(downs) + + def instantiate_pretrained(self, config): + model = instantiate_from_config(config) + self.pretrained_model = model.eval() + # self.pretrained_model.train = False + for param in self.pretrained_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def encode_with_pretrained(self, x): + c = self.pretrained_model.encode(x) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + return c + + def forward(self, x): + z_fs = self.encode_with_pretrained(x) + z = self.proj_norm(z_fs) + z = self.proj(z) + z = nonlinearity(z) + + for submodel, downmodel in zip(self.model, self.downsampler): + z = submodel(z, temb=None) + z = downmodel(z) + + if self.do_reshape: + z = rearrange(z, 'b c h w -> b (h w) c') + return z diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py new file mode 100644 index 000000000..5b6ac5fc8 --- /dev/null +++ 
b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py @@ -0,0 +1,1018 @@ +import math +from abc import abstractmethod +from functools import partial +from typing import Iterable + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.image_to_3d.ldm.modules.attention import \ + SpatialTransformer +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + avg_pool_nd, checkpoint, conv_nd, linear, normalization, + timestep_embedding, zero_module) +from modelscope.models.cv.image_to_3d.ldm.util import exists + + +# dummy replace +def convert_module_to_f16(x): + pass + + +def convert_module_to_f32(x): + pass + + +# go +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + x = layer(x, context) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. 
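As noted above, the 3-D branch keeps the leading depth dimension and doubles only the two innermost spatial dimensions, while the 2-D branch doubles every spatial dimension. A self-contained shape check using plain F.interpolate (illustrative only, not part of this module):

    import torch
    import torch.nn.functional as F

    x2d = torch.randn(1, 8, 16, 16)
    x3d = torch.randn(1, 8, 4, 16, 16)
    # dims == 2: every spatial dimension is doubled
    assert F.interpolate(x2d, scale_factor=2, mode='nearest').shape == (1, 8, 32, 32)
    # dims == 3: depth (x.shape[2]) is preserved, only the inner two dims are doubled
    assert F.interpolate(x3d, (x3d.shape[2], x3d.shape[3] * 2, x3d.shape[4] * 2),
                         mode='nearest').shape == (1, 8, 4, 32, 32)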
+ """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd( + dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), + mode='nearest') + else: + x = F.interpolate(x, scale_factor=2, mode='nearest') + if self.use_conv: + x = self.conv(x) + return x + + +class TransposedUpsample(nn.Module): + 'Learned 2x upsampling without padding' + + def __init__(self, channels, out_channels=None, ks=5): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + + self.up = nn.ConvTranspose2d( + self.channels, self.out_channels, kernel_size=ks, stride=2) + + def forward(self, x): + return self.up(x) + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=padding) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. 
+ """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels + if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd( + dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, + 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint(self._forward, (x, emb), self.parameters(), + self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: # False + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}' + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint( + self._forward, (x, ), self.parameters(), True + ) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! + # return pt_checkpoint(self._forward, x) # pytorch + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split( + ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', q * scale, + k * scale) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, + v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. + """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None, 'Fool!! You forgot to include the dimension of your \ + cross-attention conditioning...' + + if context_dim is not None: + assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your \ + cross-attention conditioning...' 
+ + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + # self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. ' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.' + ) # todo: convert to warning + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) # 0 + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: # always True + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + 
use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(self.num_res_blocks[level] + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or i < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa)) + if level and i == self.num_res_blocks[level]: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) if resblock_updown else Upsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out 
= nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module( + conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), 'must specify y if and only if the model is class-conditional' + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) # N + emb = self.time_embed(t_emb) # + + if self.num_classes is not None: + assert y.shape == (x.shape[0], ) + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) # conv + hs.append(h) + h = self.middle_block(h, emb, context) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + h = h.type(x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) + + +class EncoderUNetModel(nn.Module): + """ + The half UNet model with attention and timestep embedding. + For usage, see UNet. 
+ """ + + def __init__(self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool='adaptive', + *args, + **kwargs): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + )) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == 'adaptive': + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.AdaptiveAvgPool2d((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == 'attention': + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, + out_channels), + ) + elif pool == 'spatial': + 
self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == 'spatial_v2': + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + nn.SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f'Unexpected {pool} pooling') + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + + def forward(self, x, timesteps): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed( + timestep_embedding(timesteps, self.model_channels)) + + results = [] + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith('spatial'): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith('spatial'): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = th.cat(results, axis=-1) + return self.out(h) + else: + h = h.type(x.dtype) + return self.out(h) diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py new file mode 100644 index 000000000..a63d05a3c --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py @@ -0,0 +1,307 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! 
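The helpers below provide the noise-schedule builders, the gradient-checkpointing wrapper, and the timestep-embedding utilities used by the modules above. As a quick orientation, a typical call into the schedule builder (argument values are simply the function defaults, shown for illustration) looks like:

    # returns a float64 numpy array of shape (1000,), rising from 1e-4 to 2e-2
    betas = make_beta_schedule('linear', n_timestep=1000,
                               linear_start=1e-4, linear_end=2e-2)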
+ +import math +import os + +import numpy as np +import torch +import torch.nn as nn +from einops import repeat + +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def make_beta_schedule(schedule, + n_timestep, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + if schedule == 'linear': + betas = ( + torch.linspace( + linear_start**0.5, + linear_end**0.5, + n_timestep, + dtype=torch.float64)**2) + + elif schedule == 'cosine': + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + + cosine_s) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == 'sqrt_linear': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64) + elif schedule == 'sqrt': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64)**0.5 + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_timesteps(ddim_discr_method, + num_ddim_timesteps, + num_ddpm_timesteps, + verbose=True): + if ddim_discr_method == 'uniform': + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == 'quad': + ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), + num_ddim_timesteps))**2).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f'Selected timesteps for ddim sampler: {steps_out}') + return steps_out + + +def make_ddim_sampling_parameters(alphacums, + ddim_timesteps, + eta, + verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * # noqa + (1 - alphas / alphas_prev)) # noqa + if verbose: + print( + f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}' + ) + print( + f'For the chosen value of eta, which is {eta}, ' + f'this results in the following sigma_t schedule for ddim sampler {sigmas}' + ) + return sigmas, alphas, alphas_prev + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. 
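A common way to drive this helper is to hand it the cosine alpha-bar curve, which yields a squared-cosine beta schedule; shown here only as an illustration of the calling convention:

    import math

    def cosine_alpha_bar(t, s=0.008):
        # cumulative product of (1 - beta) as a function of t in [0, 1]
        return math.cos((t + s) / (1 + s) * math.pi / 2) ** 2

    # betas = betas_for_alpha_bar(1000, cosine_alpha_bar)   # numpy array, every entry <= 0.999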
+ """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1, ) * (len(x_shape) - 1))) + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [ + x.detach().requires_grad_(True) for x in ctx.input_tensors + ] + with torch.enable_grad(): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. + shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + else: + embedding = repeat(timesteps, 'b -> b d', d=dim) + return embedding + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. 
+ """ + return GroupNorm32(32, channels) + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + + def forward(self, x): + return x * torch.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config( + c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} + + +def noise_like(shape, device, repeat=False): + + def repeat_noise(): + return torch.randn((1, *shape[1:]), + device=device).repeat(shape[0], + *((1, ) * (len(shape) - 1))) + + def noise(): + return torch.randn(shape, device=device) + + return repeat_noise() if repeat else noise() diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/distributions/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py new file mode 100644 index 000000000..24cbbbc89 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py @@ -0,0 +1,95 @@ +import numpy as np +import torch + + +class AbstractDistribution: + + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn( + self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + 
torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + (source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/ + guided_diffusion/losses.py#L12) + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, 'at least one argument must be a Tensor' + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + # noqa + ((mean1 - mean2)**2) * torch.exp(-logvar2)) # noqa diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py new file mode 100644 index 000000000..dcc561953 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py new file mode 100644 index 000000000..413452498 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py @@ -0,0 +1,251 @@ +import hashlib +import os +import urllib +import warnings +from typing import Any, List, Union + +import packaging +import torch +from PIL import Image +from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, + ToTensor) +from tqdm import tqdm + +from modelscope.models.cv.image_to_3d.ldm.modules.encoders.clip.model import \ + build_model + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.7.1'): + warnings.warn('PyTorch version 1.7.1 or higher is recommended') + +__all__ = ['available_models', 'load'] + +_MODELS = { + 'RN50': + 'https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/\ + RN50.pt', + 'RN101': + 'https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/\ + RN101.pt', + 'RN50x4': + 'https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/\ + RN50x4.pt', + 'RN50x16': + 'https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/\ + RN50x16.pt', + 'RN50x64': + 
'https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/\ + RN50x64.pt', + 'ViT-B/32': + 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/\ + ViT-B-32.pt', + 'ViT-B/16': + 'https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/\ + ViT-B-16.pt', + 'ViT-L/14': + 'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/\ + ViT-L-14.pt', + 'ViT-L/14@336px': + 'https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/\ + ViT-L-14-336px.pt', +} + + +def _download(url: str, root: str): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split('/')[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError( + f'{download_target} exists and is not a regular file') + + if os.path.isfile(download_target): + if hashlib.sha256(open(download_target, + 'rb').read()).hexdigest() == expected_sha256: + return download_target + else: + warnings.warn( + f'{download_target} exists, but the SHA256 checksum does not match; re-downloading the file' + ) + + with urllib.request.urlopen(url) as source, open(download_target, + 'wb') as output: + with tqdm( + total=int(source.info().get('Content-Length')), + ncols=80, + unit='iB', + unit_scale=True, + unit_divisor=1024) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if hashlib.sha256(open(download_target, + 'rb').read()).hexdigest() != expected_sha256: + raise RuntimeError( + 'Model has been downloaded but the SHA256 checksum does not not match' + ) + + return download_target + + +def _convert_image_to_rgb(image): + return image.convert('RGB') + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit: bool = False, + download_root: str = None): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model or more hackable non-JIT model (default). 
+ + download_root: str + path to download the model files; by default, it uses "~/.cache/clip" + + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download( + _MODELS[name], download_root + or os.path.expanduser('~/.cache/clip')) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError( + f'Model {name} not found; available models = {available_models()}') + + with open(model_path, 'rb') as opened_file: + try: + # loading JIT archive + model = torch.jit.load( + opened_file, map_location=device if jit else 'cpu').eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn( + f'File {model_path} is not a JIT archive. Loading as a state dict instead' + ) + jit = False + state_dict = torch.load(opened_file, map_location='cpu') + + if not jit: + model = build_model(state_dict or model.state_dict()).to(device) + if str(device) == 'cpu': + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def _node_get(node: torch._C.Node, key: str): + """Gets attributes of a node which is polymorphic over return type. + + From https://github.com/pytorch/pytorch/pull/82628 + """ + sel = node.kindOf(key) + return getattr(node, sel)(key) + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + _node_get(node, 'value')).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [ + 1, 2 + ]: # dtype can be the second or third argument to aten::to() + if _node_get(inputs[i].node(), 'value') == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py new file mode 100644 index 000000000..c3d0471f5 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py @@ -0,0 +1,511 @@ +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import 
torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.relu1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu2 = nn.ReLU(inplace=True) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu3 = nn.ReLU(inplace=True) + + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu3(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + 
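+    # Note: callers feed this stack in LND order, i.e. x has shape
+    # [seq_len, batch, width]; VisionTransformer and CLIP.encode_text both
+    # permute NLD -> LND before calling it and permute back afterwards.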
+ def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + torch_zeros = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([self.class_embedding.to(x.dtype) + torch_zeros, x], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + 
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.transformer.width**-0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type( + self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(layer): + if isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Linear)): + layer.weight.data = layer.weight.data.half() + if layer.bias is not None: + layer.bias.data = layer.bias.data.half() + + if isinstance(layer, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(layer, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ['text_projection', 'proj']: + if hasattr(layer, name): + attr = getattr(layer, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = 'visual.proj' in state_dict + + if vit: + vision_width = state_dict['visual.conv1.weight'].shape[0] + vision_layers = len([ + k for k in state_dict.keys() + if k.startswith('visual.') and k.endswith('.attn.in_proj_weight') + ]) + vision_patch_size = 
state_dict['visual.conv1.weight'].shape[-1] + grid_size = round( + (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [ + len( + set( + k.split('.')[2] for k in state_dict + if k.startswith(f'visual.layer{b}'))) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0] + output_width = round( + (state_dict['visual.attnpool.positional_embedding'].shape[0] + - 1)**0.5) + vision_patch_size = None + assert output_width**2 + 1 == state_dict[ + 'visual.attnpool.positional_embedding'].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict['text_projection'].shape[1] + context_length = state_dict['positional_embedding'].shape[0] + vocab_size = state_dict['token_embedding.weight'].shape[0] + transformer_width = state_dict['ln_final.weight'].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split('.')[2] for k in state_dict + if k.startswith('transformer.resblocks'))) + + model = CLIP(embed_dim, image_resolution, vision_layers, vision_width, + vision_patch_size, context_length, vocab_size, + transformer_width, transformer_heads, transformer_layers) + + for key in ['input_resolution', 'context_length', 'vocab_size']: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py new file mode 100644 index 000000000..ffd0d0928 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py @@ -0,0 +1,149 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
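+    For example, get_pairs(('h', 'e', 'l', 'l', 'o')) returns
+    {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.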
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py new file mode 100644 index 000000000..d8fbc03d9 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py @@ -0,0 +1,704 @@ +import random +from functools import partial + +import kornia +import kornia.augmentation as K +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import transforms +from transformers import (CLIPTextModel, CLIPTokenizer, CLIPVisionModel, + T5EncoderModel, T5Tokenizer) + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + extract_into_tensor, make_beta_schedule, noise_like) +# import clip +from 
modelscope.models.cv.image_to_3d.ldm.modules.encoders import clip +# TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test +from modelscope.models.cv.image_to_3d.ldm.modules.x_transformer import ( + Encoder, TransformerWrapper) +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.id_loss import IDFeatures +from modelscope.models.cv.image_to_3d.ldm.util import (default, + instantiate_from_config) + + +class AbstractEncoder(nn.Module): + + def __init__(self): + super().__init__() + + def encode(self, *args, **kwargs): + raise NotImplementedError + + +class IdentityEncoder(AbstractEncoder): + + def encode(self, x): + return x + + +class FaceClipEncoder(AbstractEncoder): + + def __init__(self, augment=True, retreival_key=None): + super().__init__() + self.encoder = FrozenCLIPImageEmbedder() + self.augment = augment + self.retreival_key = retreival_key + + def forward(self, img): + encodings = [] + with torch.no_grad(): + x_offset = 125 + if self.retreival_key: + # Assumes retrieved image are packed into the second half of channels + face = img[:, 3:, 190:440, x_offset:(512 - x_offset)] + other = img[:, :3, ...].clone() + else: + face = img[:, :, 190:440, x_offset:(512 - x_offset)] + other = img.clone() + + if self.augment: + face = K.RandomHorizontalFlip()(face) + + other[:, :, 190:440, x_offset:(512 - x_offset)] *= 0 + encodings = [ + self.encoder.encode(face), + self.encoder.encode(other), + ] + + return torch.cat(encodings, dim=1) + + def encode(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros( + (1, 2, 768), + device=self.encoder.model.visual.conv1.weight.device) + + return self(img) + + +class FaceIdClipEncoder(AbstractEncoder): + + def __init__(self): + super().__init__() + self.encoder = FrozenCLIPImageEmbedder() + for p in self.encoder.parameters(): + p.requires_grad = False + self.id = FrozenFaceEncoder( + '/home/jpinkney/code/stable-diffusion/model_ir_se50.pth', + augment=True) + + def forward(self, img): + encodings = [] + with torch.no_grad(): + face = kornia.geometry.resize( + img, (256, 256), interpolation='bilinear', align_corners=True) + + other = img.clone() + other[:, :, 184:452, 122:396] *= 0 + encodings = [ + self.id.encode(face), + self.encoder.encode(other), + ] + + return torch.cat(encodings, dim=1) + + def encode(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros( + (1, 2, 768), + device=self.encoder.model.visual.conv1.weight.device) + + return self(img) + + +class ClassEmbedder(nn.Module): + + def __init__(self, embed_dim, n_classes=1000, key='class'): + super().__init__() + self.key = key + self.embedding = nn.Embedding(n_classes, embed_dim) + + def forward(self, batch, key=None): + if key is None: + key = self.key + # this is for use in crossattn + c = batch[key][:, None] + c = self.embedding(c) + return c + + +class TransformerEmbedder(AbstractEncoder): + """Some transformer encoder layers""" + + def __init__(self, + n_embed, + n_layer, + vocab_size, + max_seq_len=77, + device='cuda'): + super().__init__() + self.device = device + self.transformer = TransformerWrapper( + num_tokens=vocab_size, + max_seq_len=max_seq_len, + attn_layers=Encoder(dim=n_embed, depth=n_layer)) + + def forward(self, tokens): + tokens = tokens.to(self.device) # meh + z = self.transformer(tokens, return_embeddings=True) + return z + + def encode(self, x): + return self(x) + + +class BERTTokenizer(AbstractEncoder): + """ Uses a pretrained BERT tokenizer by huggingface. 
Vocab size: 30522 (?)""" + + def __init__(self, device='cuda', vq_interface=True, max_length=77): + super().__init__() + from transformers import BertTokenizerFast # TODO: add to reuquirements + self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + self.device = device + self.vq_interface = vq_interface + self.max_length = max_length + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + return tokens + + @torch.no_grad() + def encode(self, text): + tokens = self(text) + if not self.vq_interface: + return tokens + return None, None, [None, None, tokens] + + def decode(self, text): + return text + + +class BERTEmbedder(AbstractEncoder): + """Uses the BERT tokenizr model and add some transformer encoder layers""" + + def __init__(self, + n_embed, + n_layer, + vocab_size=30522, + max_seq_len=77, + device='cuda', + use_tokenizer=True, + embedding_dropout=0.0): + super().__init__() + self.use_tknz_fn = use_tokenizer + if self.use_tknz_fn: + self.tknz_fn = BERTTokenizer( + vq_interface=False, max_length=max_seq_len) + self.device = device + self.transformer = TransformerWrapper( + num_tokens=vocab_size, + max_seq_len=max_seq_len, + attn_layers=Encoder(dim=n_embed, depth=n_layer), + emb_dropout=embedding_dropout) + + def forward(self, text): + if self.use_tknz_fn: + tokens = self.tknz_fn(text) # .to(self.device) + else: + tokens = text + z = self.transformer(tokens, return_embeddings=True) + return z + + def encode(self, text): + # output of length 77 + return self(text) + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class FrozenT5Embedder(AbstractEncoder): + """Uses the T5 transformer encoder for text""" + + def __init__(self, + version='google/t5-v1_1-large', + device='cuda', + max_length=77 + ): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.transformer = T5EncoderModel.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.device = device + self.max_length = max_length # TODO: typical value? 
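+        # self.freeze() below switches the transformer to eval mode and
+        # disables all gradients, so this module only produces fixed features.
+        # Rough usage sketch (output width assumes the default
+        # 'google/t5-v1_1-large' checkpoint, d_model = 1024):
+        #   z = FrozenT5Embedder(device='cpu').encode(['a photo of a cat'])
+        #   # z.shape -> torch.Size([1, 77, 1024])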
+ self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class FrozenFaceEncoder(AbstractEncoder): + + def __init__(self, model_path, augment=False): + super().__init__() + self.loss_fn = IDFeatures(model_path) + # face encoder is frozen + for p in self.loss_fn.parameters(): + p.requires_grad = False + # Mapper is trainable + self.mapper = torch.nn.Linear(512, 768) + p = 0.25 + if augment: + self.augment = K.AugmentationSequential( + K.RandomHorizontalFlip(p=0.5), + K.RandomEqualize(p=p), + # K.RandomPlanckianJitter(p=p), + # K.RandomPlasmaBrightness(p=p), + # K.RandomPlasmaContrast(p=p), + # K.ColorJiggle(0.02, 0.2, 0.2, p=p), + ) + else: + self.augment = False + + def forward(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros((1, 1, 768), device=self.mapper.weight.device) + + if self.augment is not None: + # Transforms require 0-1 + img = self.augment((img + 1) / 2) + img = 2 * img - 1 + + feat = self.loss_fn(img, crop=True) + feat = self.mapper(feat.unsqueeze(1)) + return feat + + def encode(self, img): + return self(img) + + +class FrozenCLIPEmbedder(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.tokenizer = CLIPTokenizer.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.transformer = CLIPTextModel.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.device = device + self.max_length = max_length # TODO: typical value? + self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class ClipImageProjector(AbstractEncoder): + """ + Uses the CLIP image encoder. + """ + + def __init__(self, + version='openai/clip-vit-large-patch14', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.model = CLIPVisionModel.from_pretrained(version) + self.model.train() + self.max_length = max_length # TODO: typical value? 
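+        # CLIPVisionModel for ViT-L/14 emits 1024-d hidden states per patch
+        # token; the linear mapper below projects them to the 768-d text
+        # embedding width, and forward() zero-pads the token axis up to
+        # max_length so the result lines up with FrozenCLIPEmbedder outputs.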
+ self.antialias = True + self.mapper = torch.nn.Linear(1024, 768) + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + null_cond = self.get_null_cond(version, max_length) + self.register_buffer('null_cond', null_cond) + + @torch.no_grad() + def get_null_cond(self, version, max_length): + device = self.mean.device + embedder = FrozenCLIPEmbedder( + version=version, device=device, max_length=max_length) + null_cond = embedder(['']) + return null_cond + + def preprocess(self, x): + # Expects inputs in the range -1, 1 + x = kornia.geometry.resize( + x, (224, 224), + interpolation='bicubic', + align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + if isinstance(x, list): + return self.null_cond + # x is assumed to be in range [-1,1] + x = self.preprocess(x) + outputs = self.model(pixel_values=x) + last_hidden_state = outputs.last_hidden_state + last_hidden_state = self.mapper(last_hidden_state) + return F.pad( + last_hidden_state, + [0, 0, 0, self.max_length - last_hidden_state.shape[1], 0, 0]) + + def encode(self, im): + return self(im) + + +class ProjectedFrozenCLIPEmbedder(AbstractEncoder): + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.embedder = FrozenCLIPEmbedder( + version=version, device=device, max_length=max_length) + self.projection = torch.nn.Linear(768, 768) + + def forward(self, text): + z = self.embedder(text) + return self.projection(z) + + def encode(self, text): + return self(text) + + +class FrozenCLIPImageEmbedder(AbstractEncoder): + """ + Uses the CLIP image encoder. + Not actually frozen... If you want that set cond_stage_trainable=False in cfg + """ + + def __init__( + self, + model='ViT-L/14', + jit=False, + device='cpu', + antialias=False, + ): + super().__init__() + self.model, _ = clip.load(name=model, device=device, jit=jit) + # We don't use the text part so delete it + del self.model.transformer + self.antialias = antialias + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + + def preprocess(self, x): + # Expects inputs in the range -1, 1 + x = kornia.geometry.resize( + x, (224, 224), + interpolation='bicubic', + align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + # x is assumed to be in range [-1,1] + if isinstance(x, list): + # [""] denotes condition dropout for ucg + device = self.model.visual.conv1.weight.device + return torch.zeros(1, 768, device=device) + return self.model.encode_image(self.preprocess(x)).float() + + def encode(self, im): + return self(im).unsqueeze(1) + + +class FrozenCLIPImageMutliEmbedder(AbstractEncoder): + """ + Uses the CLIP image encoder. + Not actually frozen... 
If you want that set cond_stage_trainable=False in cfg + """ + + def __init__( + self, + model='ViT-L/14', + jit=False, + device='cpu', + antialias=True, + max_crops=5, + ): + super().__init__() + self.model, _ = clip.load(name=model, device=device, jit=jit) + # We don't use the text part so delete it + del self.model.transformer + self.antialias = antialias + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + self.max_crops = max_crops + + def preprocess(self, x): + + # Expects inputs in the range -1, 1 + randcrop = transforms.RandomResizedCrop( + 224, scale=(0.085, 1.0), ratio=(1, 1)) + max_crops = self.max_crops + patches = [] + crops = [randcrop(x) for _ in range(max_crops)] + patches.extend(crops) + x = torch.cat(patches, dim=0) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + # x is assumed to be in range [-1,1] + if isinstance(x, list): + # [""] denotes condition dropout for ucg + device = self.model.visual.conv1.weight.device + return torch.zeros(1, self.max_crops, 768, device=device) + batch_tokens = [] + for im in x: + patches = self.preprocess(im.unsqueeze(0)) + tokens = self.model.encode_image(patches).float() + for t in tokens: + if random.random() < 0.1: + t *= 0 + batch_tokens.append(tokens.unsqueeze(0)) + + return torch.cat(batch_tokens, dim=0) + + def encode(self, im): + return self(im) + + +class SpatialRescaler(nn.Module): + + def __init__(self, + n_stages=1, + method='bilinear', + multiplier=0.5, + in_channels=3, + out_channels=None, + bias=False): + super().__init__() + self.n_stages = n_stages + assert self.n_stages >= 0 + assert method in [ + 'nearest', 'linear', 'bilinear', 'trilinear', 'bicubic', 'area' + ] + self.multiplier = multiplier + self.interpolator = partial( + torch.nn.functional.interpolate, mode=method) + self.remap_output = out_channels is not None + if self.remap_output: + print( + f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.' + ) + self.channel_mapper = nn.Conv2d( + in_channels, out_channels, 1, bias=bias) + + def forward(self, x): + for stage in range(self.n_stages): + x = self.interpolator(x, scale_factor=self.multiplier) + + if self.remap_output: + x = self.channel_mapper(x) + return x + + def encode(self, x): + return self(x) + + +class LowScaleEncoder(nn.Module): + + def __init__(self, + model_config, + linear_start, + linear_end, + timesteps=1000, + max_noise_level=250, + output_size=64, + scale_factor=1.0): + super().__init__() + self.max_noise_level = max_noise_level + self.model = instantiate_from_config(model_config) + self.augmentation_schedule = self.register_schedule( + timesteps=timesteps, + linear_start=linear_start, + linear_end=linear_end) + self.out_size = output_size + self.scale_factor = scale_factor + + def register_schedule(self, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. 
- betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[ + 0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod - 1))) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def forward(self, x): + z = self.model.encode(x).sample() + z = z * self.scale_factor + noise_level = torch.randint( + 0, self.max_noise_level, (x.shape[0], ), device=x.device).long() + z = self.q_sample(z, noise_level) + if self.out_size is not None: + z = torch.nn.functional.interpolate( + z, size=self.out_size, + mode='nearest') # TODO: experiment with mode + # z = z.repeat_interleave(2, -2).repeat_interleave(2, -1) + return z, noise_level + + def decode(self, z): + z = z / self.scale_factor + return self.model.decode(z) + + +if __name__ == '__main__': + from ldm.util import count_params + sentences = [ + 'a hedgehog drinking a whiskey', 'der mond ist aufgegangen', + "Ein Satz mit vielen Sonderzeichen: äöü ß ?! 
: 'xx-y/@s'" + ] + model = FrozenT5Embedder(version='google/t5-v1_1-xl').cuda() + count_params(model, True) + z = model(sentences) + print(z.shape) + + model = FrozenCLIPEmbedder().cuda() + count_params(model, True) + z = model(sentences) + print(z.shape) + + print('done.') diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py b/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py new file mode 100644 index 000000000..0e5d7b8f7 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py @@ -0,0 +1,682 @@ +"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers""" +from collections import namedtuple +from functools import partial +from inspect import isfunction + +import torch +import torch.nn.functional as F +from einops import rearrange, reduce, repeat +from torch import einsum, nn + +# constants + +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', + ['pre_softmax_attn', 'post_softmax_attn']) + +LayerIntermediates = namedtuple('Intermediates', + ['hiddens', 'attn_intermediates']) + + +class AbsolutePositionalEmbedding(nn.Module): + + def __init__(self, dim, max_seq_len): + super().__init__() + self.emb = nn.Embedding(max_seq_len, dim) + self.init_() + + def init_(self): + nn.init.normal_(self.emb.weight, std=0.02) + + def forward(self, x): + n = torch.arange(x.shape[1], device=x.device) + return self.emb(n)[None, :, :] + + +class FixedPositionalEmbedding(nn.Module): + + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000**(torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = torch.arange( + x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset + sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + return emb[None, :, :] + + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def always(val): + + def inner(*args, **kwargs): + return val + + return inner + + +def not_equals(val): + + def inner(x): + return x != val + + return inner + + +def equals(val): + + def inner(x): + return x == val + + return inner + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +# keyword argument helpers + + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val, ) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key( + partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict( + map(lambda x: (x[0][len(prefix):], x[1]), + tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + + +# classes +class Scale(nn.Module): + + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.value, *rest) + + +class Rezero(nn.Module): + + def __init__(self, fn): + super().__init__() + self.fn 
= fn + self.g = nn.Parameter(torch.zeros(1)) + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.g, *rest) + + +class ScaleNorm(nn.Module): + + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Module): + + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class Residual(nn.Module): + + def forward(self, x, residual): + return x + residual + + +class GRUGating(nn.Module): + + def __init__(self, dim): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + + def forward(self, x, residual): + gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d')) + + return gated_output.reshape_as(x) + + +# feedforward + + +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +# attention. 
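+# Multi-head attention for the x-transformers port below. Besides plain scaled
+# dot-product attention it optionally supports causal masking, talking heads,
+# learned memory key/values and top-k sparse attention, and it tracks the
+# pre-/post-softmax attention maps (see the Intermediates tuple defined above).
+# A minimal self-attention call (shapes illustrative):
+#   Attention(dim=512, heads=8)(torch.randn(2, 77, 512))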
+class Attention(nn.Module): + + def __init__(self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + mask=None, + talking_heads=False, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False): + super().__init__() + if use_entmax15: + raise NotImplementedError( + 'Check out entmax activation instead of softmax activation!') + self.scale = dim_head**-0.5 + self.heads = heads + self.causal = causal + self.mask = mask + + inner_dim = dim_head * heads + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_k = nn.Linear(dim, inner_dim, bias=False) + self.to_v = nn.Linear(dim, inner_dim, bias=False) + self.dropout = nn.Dropout(dropout) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + # self.attn_fn = entmax15 if use_entmax15 else F.softmax + self.attn_fn = F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear( + inner_dim, dim + * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim) + + def forward(self, + x, + context=None, + mask=None, + context_mask=None, + rel_pos=None, + sinusoidal_emb=None, + prev_attn=None, + mem=None): + b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), + (q, k, v)) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones( + (b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default( + k_mask, lambda: torch.ones( + (b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), + (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad( + input_mask, (self.num_mem_kv, 0), value=True) + + dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, + self.pre_softmax_proj).contiguous() + + if exists(rel_pos): + dots = rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if self.causal: + i, j = dots.shape[-2:] + r = 
torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange( + r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, + self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)') + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn) + + return self.to_out(out), intermediates + + +class AttentionLayers(nn.Module): + + def __init__(self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rmsnorm=False, + use_rezero=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + position_infused_attn=False, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + **kwargs): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + # dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + + self.has_pos_emb = position_infused_attn + self.pia_pos_emb = FixedPositionalEmbedding( + dim) if position_infused_attn else None + self.rotary_pos_emb = always(None) + + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than \ + the relative position max distance' + + self.rel_pos = None + + self.pre_norm = pre_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f', ) + default_block + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len( + default_block + ) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f', ) * ( + par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f', ) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a', ) * sandwich_coef + default_block * ( + depth - sandwich_coef) + ('f', ) * 
sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + for layer_type in self.layer_types: + if layer_type == 'a': + layer = Attention( + dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if isinstance(layer, Attention) and exists(branch_fn): + layer = branch_fn(layer) + + if gate_residual: + residual_fn = GRUGating(dim) + else: + residual_fn = Residual() + + self.layers.append(nn.ModuleList([norm_fn(), layer, residual_fn])) + + def forward(self, + x, + context=None, + mask=None, + context_mask=None, + mems=None, + return_hiddens=False): + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + + for ind, (layer_type, (norm, block, residual_fn)) in enumerate( + zip(self.layer_types, self.layers)): + is_last = ind == (len(self.layers) - 1) + + if layer_type == 'a': + hiddens.append(x) + layer_mem = mems.pop(0) + + residual = x + + if self.pre_norm: + x = norm(x) + + if layer_type == 'a': + out, inter = block( + x, + mask=mask, + sinusoidal_emb=self.pia_pos_emb, + rel_pos=self.rel_pos, + prev_attn=prev_attn, + mem=layer_mem) + elif layer_type == 'c': + out, inter = block( + x, + context=context, + mask=mask, + context_mask=context_mask, + prev_attn=prev_cross_attn) + elif layer_type == 'f': + out = block(x) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if not self.pre_norm and not is_last: + x = norm(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, attn_intermediates=intermediates) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + +class TransformerWrapper(nn.Module): + + def __init__(self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0., + emb_dropout=0., + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True): + super().__init__() + assert isinstance( + attn_layers, AttentionLayers + ), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.num_tokens = num_tokens + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, + dim) if emb_dim != dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + self.to_logits = nn.Linear( + dim, num_tokens + ) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + 
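+ # num_memory_tokens defaults to 0; when positive, the learned memory tokens
+ # (akin to [CLS]) are prepended to every sequence in forward(), the attention
+ # mask is padded accordingly, and the tokens are stripped off again before
+ # the logits / embeddings are returned.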
self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter( + torch.randn(num_memory_tokens, dim)) + + # let funnel encoder know number of memory tokens, if specified + if hasattr(attn_layers, 'num_memory_tokens'): + attn_layers.num_memory_tokens = num_memory_tokens + + def init_(self): + nn.init.normal_(self.token_emb.weight, std=0.02) + + def forward(self, + x, + return_embeddings=False, + mask=None, + return_mems=False, + return_attn=False, + mems=None, + **kwargs): + b, _, _, num_mem = *x.shape, x.device, self.num_memory_tokens + x = self.token_emb(x) + x += self.pos_emb(x) + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) + x = torch.cat((mem, x), dim=1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = F.pad(mask, (num_mem, 0), value=True) + + x, intermediates = self.attn_layers( + x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] + + out = self.to_logits(x) if not return_embeddings else x + + if return_mems: + hiddens = intermediates.hiddens + new_mems = list( + map(lambda pair: torch.cat(pair, dim=-2), zip( + mems, hiddens))) if exists(mems) else hiddens + new_mems = list( + map(lambda t: t[..., -self.max_mem_len:, :].detach(), + new_mems)) + return out, new_mems + + if return_attn: + attn_maps = list( + map(lambda t: t.post_softmax_attn, + intermediates.attn_intermediates)) + return out, attn_maps + + return out diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py new file mode 100644 index 000000000..954db9cd5 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py @@ -0,0 +1,133 @@ +# https://github.com/eladrich/pixel2style2pixel + +from collections import namedtuple + +import torch +from torch.nn import (AdaptiveAvgPool2d, BatchNorm2d, Conv2d, MaxPool2d, + Module, PReLU, ReLU, Sequential, Sigmoid) + +# ArcFace implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + return output + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + """ A named tuple describing a ResNet block. """ + + +def get_block(in_channel, depth, num_units, stride=2): + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + else: + raise ValueError( + 'Invalid number of layers: {}. 
Must be one of [50, 100, 152]'. + format(num_layers)) + return blocks + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class bottleneck_IR(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class bottleneck_IR_SE(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR_SE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py new file mode 100644 index 000000000..c6cb52bc7 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py @@ -0,0 +1,27 @@ +# https://github.com/eladrich/pixel2style2pixel +import torch +from torch import nn + +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.model_irse import Backbone + + +class IDFeatures(nn.Module): + + def __init__(self, model_path): + super(IDFeatures, self).__init__() + print('Loading ResNet ArcFace') + self.facenet = Backbone( + input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se') + self.facenet.load_state_dict( + torch.load(model_path, map_location='cpu')) + self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112)) + self.facenet.eval() + + def forward(self, x, crop=False): + # Not sure of the image range here + if crop: + x = torch.nn.functional.interpolate(x, (256, 256), mode='area') + x = x[:, :, 35:223, 32:220] + x = self.face_pool(x) + x_feats = self.facenet(x) + return x_feats diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py new file mode 100644 index 000000000..f3d6deab3 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py @@ -0,0 +1,96 @@ +# https://github.com/eladrich/pixel2style2pixel + +from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, + Module, PReLU, Sequential) + +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.helpers 
import ( + Flatten, bottleneck_IR, bottleneck_IR_SE, get_blocks, l2_norm) + +# Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) + + +class Backbone(Module): + + def __init__(self, + input_size, + num_layers, + mode='ir', + drop_ratio=0.4, + affine=True): + super(Backbone, self).__init__() + assert input_size in [112, 224], 'input_size should be 112 or 224' + assert num_layers in [50, 100, + 152], 'num_layers should be 50, 100 or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + if input_size == 112: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512, affine=affine)) + else: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 14 * 14, 512), BatchNorm1d(512, affine=affine)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return l2_norm(x) + + +def IR_50(input_size): + """Constructs a ir-50 model.""" + model = Backbone( + input_size, num_layers=50, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_101(input_size): + """Constructs a ir-101 model.""" + model = Backbone( + input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_152(input_size): + """Constructs a ir-152 model.""" + model = Backbone( + input_size, num_layers=152, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_50(input_size): + """Constructs a ir_se-50 model.""" + model = Backbone( + input_size, num_layers=50, mode='ir_se', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_101(input_size): + """Constructs a ir_se-101 model.""" + model = Backbone( + input_size, num_layers=100, mode='ir_se', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_152(input_size): + """Constructs a ir_se-152 model.""" + model = Backbone( + input_size, num_layers=152, mode='ir_se', drop_ratio=0.4, affine=False) + return model diff --git a/modelscope/models/cv/image_to_3d/ldm/util.py b/modelscope/models/cv/image_to_3d/ldm/util.py new file mode 100644 index 000000000..83ac20a3e --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/util.py @@ -0,0 +1,302 @@ +import importlib +import os +import time +from inspect import isfunction + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import PIL +import torch +import torchvision +from PIL import Image, ImageDraw, ImageFont +from torch import optim + + +def pil_rectangle_crop(im): + width, height = im.size # Get dimensions + + if width <= height: + left = 0 + right = width + top = (height - width) / 2 + bottom = (height + width) / 2 + else: + + top = 0 + bottom = height + left = (width - height) / 2 + bottom = (width + height) / 2 + + # Crop the center of the image + im = im.crop((left, top, right, bottom)) + return im + + +def add_margin(pil_img, color=0, size=256): + width, height = pil_img.size + result = Image.new(pil_img.mode, (size, size), color) + result.paste(pil_img, ((size - width) // 2, (size - height) // 2)) 
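+ # at this point `result` is a size x size canvas filled with `color`, with
+ # the input image pasted at the offsets above so that it sits centered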
+ return result + + +def create_carvekit_interface(): + from carvekit.api.high import HiInterface + # Check doc strings for more information + interface = HiInterface( + object_type='object', # Can be "object" or "hairs-like". + batch_size_seg=5, + batch_size_matting=1, + device='cuda' if torch.cuda.is_available() else 'cpu', + seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net + matting_mask_size=2048, + trimap_prob_threshold=231, + trimap_dilation=30, + trimap_erosion_iters=5, + fp16=False) + + return interface + + +def load_and_preprocess(interface, input_im): + ''' + :param input_im (PIL Image). + :return image (H, W, 3) array in [0, 1]. + ''' + # See https://github.com/Ir1d/image-background-remove-tool + image = input_im.convert('RGB') + + image_without_background = interface([image])[0] + image_without_background = np.array(image_without_background) + est_seg = image_without_background > 127 + image = np.array(image) + foreground = est_seg[:, :, -1].astype(np.bool_) + image[~foreground] = [255., 255., 255.] + x, y, w, h = cv2.boundingRect(foreground.astype(np.uint8)) + image = image[y:y + h, x:x + w, :] + image = PIL.Image.fromarray(np.array(image)) + + # resize image such that long edge is 512 + image.thumbnail([200, 200], Image.LANCZOS) + image = add_margin(image, (255, 255, 255), size=256) + image = np.array(image) + + return image + + +def log_txt_as_img(wh, xc, size=10): + # wh a tuple of (width, height) + # xc a list of captions to plot + b = len(xc) + txts = list() + for bi in range(b): + txt = Image.new('RGB', wh, color='white') + draw = ImageDraw.Draw(txt) + font = ImageFont.truetype('data/DejaVuSans.ttf', size=size) + nc = int(40 * (wh[0] / 256)) + lines = '\n'.join(xc[bi][start:start + nc] + for start in range(0, len(xc[bi]), nc)) + + try: + draw.text((0, 0), lines, fill='black', font=font) + except UnicodeEncodeError: + print('Cant encode string for logging. Skipping.') + + txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 + txts.append(txt) + txts = np.stack(txts) + txts = torch.tensor(txts) + return txts + + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + + +def isimage(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def mean_flat(tensor): + """ + https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print( + f'{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.' 
+ ) + return total_params + + +def instantiate_from_config(config): + if 'target' not in config: + if config == '__is_first_stage__': + return None + elif config == '__is_unconditional__': + return None + raise KeyError('Expected key `target` to instantiate.') + return get_obj_from_str(config['target'])(**config.get('params', dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit('.', 1) + print(module) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +class AdamWwithEMAandWings(optim.Optimizer): + # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 + def __init__( + self, # noqa + params, # noqa + lr=1.e-3, # noqa + betas=(0.9, 0.999), # noqa + eps=1.e-8, # noqa + weight_decay=1.e-2, # noqa + amsgrad=False, # noqa + ema_decay=0.9999, # ema decay to match previous code # noqa + ema_power=1., # noqa + param_names=()): # noqa + # TODO: check hyperparameters before using + """AdamW that saves EMA versions of the parameters.""" + if not 0.0 <= lr: + raise ValueError('Invalid learning rate: {}'.format(lr)) + if not 0.0 <= eps: + raise ValueError('Invalid epsilon value: {}'.format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError('Invalid beta parameter at index 0: {}'.format( + betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError('Invalid beta parameter at index 1: {}'.format( + betas[1])) + if not 0.0 <= weight_decay: + raise ValueError( + 'Invalid weight_decay value: {}'.format(weight_decay)) + if not 0.0 <= ema_decay <= 1.0: + raise ValueError('Invalid ema_decay value: {}'.format(ema_decay)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + ema_decay=ema_decay, + ema_power=ema_power, + param_names=param_names) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + ema_params_with_grad = [] + # state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + ema_decay = group['ema_decay'] + ema_power = group['ema_power'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError( + 'AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of parameter values + state['param_exp_avg'] = p.detach().float().clone() + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + ema_params_with_grad.append(state['param_exp_avg']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optim._functional.adamw( + params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=False) + + cur_ema_decay = min(ema_decay, 1 - state['step']**-ema_power) + for param, ema_param in zip(params_with_grad, + ema_params_with_grad): + ema_param.mul_(cur_ema_decay).add_( + param.float(), alpha=1 - cur_ema_decay) + + return loss diff --git a/modelscope/models/cv/image_try_on/generator.py b/modelscope/models/cv/image_try_on/generator.py index 47e2bc1a5..1b1552cc2 100644 --- a/modelscope/models/cv/image_try_on/generator.py +++ b/modelscope/models/cv/image_try_on/generator.py @@ -1,5 +1,5 @@ # The implementation here is modified based on spade, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/NVlabs/SPADE +# originally Apache 2.0 License and publicly available at https://github.com/NVlabs/SPADE import functools import os diff --git a/modelscope/models/cv/image_try_on/landmark.py b/modelscope/models/cv/image_try_on/landmark.py index f74416d54..489e59c30 100644 --- a/modelscope/models/cv/image_try_on/landmark.py +++ b/modelscope/models/cv/image_try_on/landmark.py @@ -1,5 +1,5 @@ # The implementation here is modified based on hrnet, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +# originally Apache 2.0 License and publicly available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation import logging import os diff --git a/modelscope/models/cv/image_try_on/warping.py b/modelscope/models/cv/image_try_on/warping.py index 6c9cf18cd..c0116e01b 100644 --- a/modelscope/models/cv/image_try_on/warping.py +++ b/modelscope/models/cv/image_try_on/warping.py @@ -1,5 +1,5 @@ # The implementation here is modified based on flow-style-vton, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/SenHe/Flow-Style-VTON +# originally Apache 2.0 License and publicly available at https://github.com/SenHe/Flow-Style-VTON from collections import OrderedDict from math import sqrt diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py index 645821405..b97926884 100644 --- a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py @@ -109,7 +109,7 @@ def forward(self, outputs, videos_metadata, samples_shape_with_padding): 1) # remove the padding # resize the masks back to their original frames dataset size for evaluation: original_frames_size = video_metadata['original_frame_size'] - tuple_size = tuple(original_frames_size.cpu().numpy()) + tuple_size = tuple(original_frames_size.cpu()) video_pred_masks = F.interpolate( video_pred_masks.float(), size=tuple_size, 
mode='nearest') video_pred_masks = video_pred_masks.to(torch.uint8).cpu() diff --git a/modelscope/models/cv/self_supervised_depth_completion/__init__.py b/modelscope/models/cv/self_supervised_depth_completion/__init__.py new file mode 100644 index 000000000..e8e8e4cf7 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .self_supervised_depth_completion import SelfSupervisedDepthCompletion +else: + _import_structure = { + 'selfsuperviseddepthcompletion': ['SelfSupervisedDepthCompletion'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/self_supervised_depth_completion/criteria.py b/modelscope/models/cv/self_supervised_depth_completion/criteria.py new file mode 100644 index 000000000..d221ae58b --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/criteria.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +loss_names = ['l1', 'l2'] + + +class MaskedMSELoss(nn.Module): + + def __init__(self): + super(MaskedMSELoss, self).__init__() + + def forward(self, pred, target): + assert pred.dim() == target.dim(), 'inconsistent dimensions' + valid_mask = (target > 0).detach() + diff = target - pred + diff = diff[valid_mask] + self.loss = (diff**2).mean() + return self.loss + + +class MaskedL1Loss(nn.Module): + + def __init__(self): + super(MaskedL1Loss, self).__init__() + + def forward(self, pred, target, weight=None): + assert pred.dim() == target.dim(), 'inconsistent dimensions' + valid_mask = (target > 0).detach() + diff = target - pred + diff = diff[valid_mask] + self.loss = diff.abs().mean() + return self.loss + + +class PhotometricLoss(nn.Module): + + def __init__(self): + super(PhotometricLoss, self).__init__() + + def forward(self, target, recon, mask=None): + + assert recon.dim( + ) == 4, 'expected recon dimension to be 4, but instead got {}.'.format( + recon.dim()) + assert target.dim( + ) == 4, 'expected target dimension to be 4, but instead got {}.'.format( + target.dim()) + assert recon.size() == target.size(), 'expected recon and target to have the same size, but got {} and {} '\ + .format(recon.size(), target.size()) + diff = (target - recon).abs() + diff = torch.sum(diff, 1) # sum along the color channel + + # compare only pixels that are not black + valid_mask = (torch.sum(recon, 1) > 0).float() * (torch.sum(target, 1) + > 0).float() + if mask is not None: + valid_mask = valid_mask * torch.squeeze(mask).float() + valid_mask = valid_mask.byte().detach() + if valid_mask.numel() > 0: + diff = diff[valid_mask] + if diff.nelement() > 0: + self.loss = diff.mean() + else: + logger.info( + 'warning: diff.nelement()==0 in PhotometricLoss (this is expected during early stage of training, \ + try larger batch size).') + self.loss = 0 + else: + logger.info('warning: 0 valid pixel in PhotometricLoss') + self.loss = 0 + return self.loss + + +class SmoothnessLoss(nn.Module): + + def __init__(self): + super(SmoothnessLoss, self).__init__() + + def forward(self, depth): + + def second_derivative(x): + assert x.dim( + ) == 4, 'expected 4-dimensional data, but instead got {}'.format( + x.dim()) + horizontal = 2 * x[:, :, 1:-1, 
1:-1] - x[:, :, + 1:-1, :-2] - x[:, :, 1:-1, + 2:] + vertical = 2 * x[:, :, 1:-1, 1:-1] - x[:, :, :-2, + 1:-1] - x[:, :, 2:, 1:-1] + der_2nd = horizontal.abs() + vertical.abs() + return der_2nd.mean() + + self.loss = second_derivative(depth) + return self.loss diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/__init__.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py new file mode 100644 index 000000000..937be3bfb --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py @@ -0,0 +1,344 @@ +import glob +import os +import os.path +from random import choice + +import cv2 +import numpy as np +import torch.utils.data as data +from numpy import linalg as LA +from PIL import Image + +from modelscope.models.cv.self_supervised_depth_completion.dataloaders import \ + transforms +from modelscope.models.cv.self_supervised_depth_completion.dataloaders.pose_estimator import \ + get_pose_pnp + +input_options = ['d', 'rgb', 'rgbd', 'g', 'gd'] + + +def load_calib(args): + """ + Temporarily hardcoding the calibration matrix using calib file from 2011_09_26 + """ + calib = open(os.path.join(args.data_folder, 'calib_cam_to_cam.txt'), 'r') + lines = calib.readlines() + P_rect_line = lines[25] + + Proj_str = P_rect_line.split(':')[1].split(' ')[1:] + Proj = np.reshape(np.array([float(p) for p in Proj_str]), + (3, 4)).astype(np.float32) + K = Proj[:3, :3] # camera matrix + + # note: we will take the center crop of the images during augmentation + # that changes the optical centers, but not focal lengths + K[0, 2] = K[ + 0, + 2] - 13 # from width = 1242 to 1216, with a 13-pixel cut on both sides + K[1, 2] = K[ + 1, + 2] - 11.5 # from width = 375 to 352, with a 11.5-pixel cut on both sides + return K + + +def get_paths_and_transform(split, args): + assert (args.use_d or args.use_rgb + or args.use_g), 'no proper input selected' + + if split == 'train': + transform = train_transform + glob_d = os.path.join( + args.data_folder, + 'data_depth_velodyne/train/*_sync/proj_depth/velodyne_raw/image_0[2,3]/*.png' + ) + glob_gt = os.path.join( + args.data_folder, + 'data_depth_annotated/train/*_sync/proj_depth/groundtruth/image_0[2,3]/*.png' + ) + + def get_rgb_paths(p): + ps = p.split('/') + pnew = '/'.join([args.data_folder] + ['data_rgb'] + ps[-6:-4] + + ps[-2:-1] + ['data'] + ps[-1:]) + return pnew + elif split == 'val': + if args.val == 'full': + transform = val_transform + glob_d = os.path.join( + args.data_folder, + 'data_depth_velodyne/val/*_sync/proj_depth/velodyne_raw/image_0[2,3]/*.png' + ) + glob_gt = os.path.join( + args.data_folder, + 'data_depth_annotated/val/*_sync/proj_depth/groundtruth/image_0[2,3]/*.png' + ) + + def get_rgb_paths(p): + ps = p.split('/') + pnew = '/'.join(ps[:-7] + ['data_rgb '] + ps[-6:-4] + ps[-2:-1] + + ['data'] + ps[-1:]) + return pnew + elif args.val == 'select': + transform = no_transform + glob_d = os.path.join( + args.data_folder, + 'depth_selection/val_selection_cropped/velodyne_raw/*.png') + glob_gt = os.path.join( + args.data_folder, + 'depth_selection/val_selection_cropped/groundtruth_depth/*.png' + ) + + def get_rgb_paths(p): + return p.replace('groundtruth_depth', 'image') + elif split == 'test_completion': + transform = no_transform + glob_d = 
os.path.join( + args.data_folder, + 'depth_selection/test_depth_completion_anonymous/velodyne_raw/*.png' + ) + glob_gt = None # "test_depth_completion_anonymous/" + glob_rgb = os.path.join( + args.data_folder, + 'depth_selection/test_depth_completion_anonymous/image/*.png') + elif split == 'test_prediction': + transform = no_transform + glob_d = None + glob_gt = None # "test_depth_completion_anonymous/" + glob_rgb = os.path.join( + args.data_folder, + 'depth_selection/test_depth_prediction_anonymous/image/*.png') + else: + raise ValueError('Unrecognized split ' + str(split)) + + if glob_gt is not None: + # train or val-full or val-select + paths_d = sorted(glob.glob(glob_d)) + paths_gt = sorted(glob.glob(glob_gt)) + paths_rgb = [get_rgb_paths(p) for p in paths_gt] + else: + # test only has d or rgb + paths_rgb = sorted(glob.glob(glob_rgb)) + paths_gt = [None] * len(paths_rgb) + if split == 'test_prediction': + paths_d = [None] * len( + paths_rgb) # test_prediction has no sparse depth + else: + paths_d = sorted(glob.glob(glob_d)) + + if len(paths_d) == 0 and len(paths_rgb) == 0 and len(paths_gt) == 0: + raise (RuntimeError('Found 0 images under {}'.format(glob_gt))) + if len(paths_d) == 0 and args.use_d: + raise (RuntimeError('Requested sparse depth but none was found')) + if len(paths_rgb) == 0 and args.use_rgb: + raise (RuntimeError('Requested rgb images but none was found')) + if len(paths_rgb) == 0 and args.use_g: + raise (RuntimeError('Requested gray images but no rgb was found')) + if len(paths_rgb) != len(paths_d) or len(paths_rgb) != len(paths_gt): + raise (RuntimeError('Produced different sizes for datasets')) + + paths = {'rgb': paths_rgb, 'd': paths_d, 'gt': paths_gt} + return paths, transform + + +def rgb_read(filename): + assert os.path.exists(filename), 'file not found: {}'.format(filename) + img_file = Image.open(filename) + # rgb_png = np.array(img_file, dtype=float) / 255.0 # scale pixels to the range [0,1] + rgb_png = np.array(img_file, dtype='uint8') # in the range [0,255] + img_file.close() + return rgb_png + + +def depth_read(filename): + # loads depth map D from png file + # and returns it as a numpy array, + # for details see readme.txt + assert os.path.exists(filename), 'file not found: {}'.format(filename) + img_file = Image.open(filename) + depth_png = np.array(img_file, dtype=int) + img_file.close() + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255, \ + 'np.max(depth_png)={}, path={}'.format(np.max(depth_png), filename) + + depth = depth_png.astype(float) / 256. + # depth[depth_png == 0] = -1. 
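+ # pixels with no depth measurement stay 0 (the commented-out line above would
+ # remap them to -1); the trailing axis turns the (H, W) map into (H, W, 1)
+ # so it is handled as a single-channel image downstream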
+ depth = np.expand_dims(depth, -1) + return depth + + +oheight, owidth = 352, 1216 + + +def drop_depth_measurements(depth, prob_keep): + mask = np.random.binomial(1, prob_keep, depth.shape) + depth *= mask + return depth + + +def train_transform(rgb, sparse, target, rgb_near, args): + # s = np.random.uniform(1.0, 1.5) # random scaling + # angle = np.random.uniform(-5.0, 5.0) # random rotation degrees + do_flip = np.random.uniform(0.0, 1.0) < 0.5 # random horizontal flip + + transform_geometric = transforms.Compose([ + # transforms.Rotate(angle), + # transforms.Resize(s), + transforms.BottomCrop((oheight, owidth)), + transforms.HorizontalFlip(do_flip) + ]) + if sparse is not None: + sparse = transform_geometric(sparse) + target = transform_geometric(target) + if rgb is not None: + brightness = np.random.uniform( + max(0, 1 - args.jitter), 1 + args.jitter) + contrast = np.random.uniform(max(0, 1 - args.jitter), 1 + args.jitter) + saturation = np.random.uniform( + max(0, 1 - args.jitter), 1 + args.jitter) + transform_rgb = transforms.Compose([ + transforms.ColorJitter(brightness, contrast, saturation, 0), + transform_geometric + ]) + rgb = transform_rgb(rgb) + if rgb_near is not None: + rgb_near = transform_rgb(rgb_near) + # sparse = drop_depth_measurements(sparse, 0.9) + + return rgb, sparse, target, rgb_near + + +def val_transform(rgb, sparse, target, rgb_near, args): + transform = transforms.Compose([ + transforms.BottomCrop((oheight, owidth)), + ]) + if rgb is not None: + rgb = transform(rgb) + if sparse is not None: + sparse = transform(sparse) + if target is not None: + target = transform(target) + if rgb_near is not None: + rgb_near = transform(rgb_near) + return rgb, sparse, target, rgb_near + + +def no_transform(rgb, sparse, target, rgb_near, args): + return rgb, sparse, target, rgb_near + + +to_tensor = transforms.ToTensor() + + +def to_float_tensor(x): + return to_tensor(x).float() + + +def handle_gray(rgb, args): + if rgb is None: + return None, None + if not args.use_g: + return rgb, None + else: + img = np.array(Image.fromarray(rgb).convert('L')) + img = np.expand_dims(img, -1) + if not args.use_rgb: + rgb_ret = None + else: + rgb_ret = rgb + return rgb_ret, img + + +def get_rgb_near(path, args): + assert path is not None, 'path is None' + + def extract_frame_id(filename): + head, tail = os.path.split(filename) + number_string = tail[0:tail.find('.')] + number = int(number_string) + return head, number + + def get_nearby_filename(filename, new_id): + head, _ = os.path.split(filename) + new_filename = os.path.join(head, '%010d.png' % new_id) + return new_filename + + head, number = extract_frame_id(path) + count = 0 + max_frame_diff = 3 + candidates = [ + i - max_frame_diff for i in range(max_frame_diff * 2 + 1) + if i - max_frame_diff != 0 + ] + while True: + random_offset = choice(candidates) + path_near = get_nearby_filename(path, number + random_offset) + if os.path.exists(path_near): + break + assert count < 20, 'cannot find a nearby frame in 20 trials for {}'.format( + path) + count += 1 + + return rgb_read(path_near) + + +class KittiDepth(data.Dataset): + """A data loader for the Kitti dataset + """ + + def __init__(self, split, args): + self.args = args + self.split = split + paths, transform = get_paths_and_transform(split, args) + self.paths = paths + self.transform = transform + self.K = load_calib(args) + self.threshold_translation = 0.1 + + def __getraw__(self, index): + rgb = rgb_read(self.paths['rgb'][index]) if \ + (self.paths['rgb'][index] is not None and 
(self.args.use_rgb or self.args.use_g)) else None + sparse = depth_read(self.paths['d'][index]) if \ + (self.paths['d'][index] is not None and self.args.use_d) else None + target = depth_read(self.paths['gt'][index]) if \ + self.paths['gt'][index] is not None else None + rgb_near = get_rgb_near(self.paths['rgb'][index], self.args) if \ + self.split == 'train' and self.args.use_pose else None + return rgb, sparse, target, rgb_near + + def __getitem__(self, index): + rgb, sparse, target, rgb_near = self.__getraw__(index) + rgb, sparse, target, rgb_near = self.transform(rgb, sparse, target, + rgb_near, self.args) + r_mat, t_vec = None, None + if self.split == 'train' and self.args.use_pose: + success, r_vec, t_vec = get_pose_pnp(rgb, rgb_near, sparse, self.K) + # discard if translation is too small + success = success and LA.norm(t_vec) > self.threshold_translation + if success: + r_mat, _ = cv2.Rodrigues(r_vec) + else: + # return the same image and no motion when PnP fails + rgb_near = rgb + t_vec = np.zeros((3, 1)) + r_mat = np.eye(3) + + rgb, gray = handle_gray(rgb, self.args) + candidates = { + 'rgb': rgb, + 'd': sparse, + 'gt': target, + 'g': gray, + 'r_mat': r_mat, + 't_vec': t_vec, + 'rgb_near': rgb_near + } + items = { + key: to_float_tensor(val) + for key, val in candidates.items() if val is not None + } + + return items + + def __len__(self): + return len(self.paths['gt']) diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py new file mode 100644 index 000000000..996725bf1 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py @@ -0,0 +1,102 @@ +import cv2 +import numpy as np + + +def rgb2gray(rgb): + return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) + + +def convert_2d_to_3d(u, v, z, K): + v0 = K[1][2] + u0 = K[0][2] + fy = K[1][1] + fx = K[0][0] + x = (u - u0) * z / fx + y = (v - v0) * z / fy + return (x, y, z) + + +def feature_match(img1, img2): + r''' Find features on both images and match them pairwise + ''' + max_n_features = 1000 + # max_n_features = 500 + use_flann = False # better not use flann + + detector = cv2.xfeatures2d.SIFT_create(max_n_features) + + # find the keypoints and descriptors with SIFT + kp1, des1 = detector.detectAndCompute(img1, None) + kp2, des2 = detector.detectAndCompute(img2, None) + if (des1 is None) or (des2 is None): + return [], [] + des1 = des1.astype(np.float32) + des2 = des2.astype(np.float32) + + if use_flann: + # FLANN parameters + FLANN_INDEX_KDTREE = 0 + index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5) + search_params = dict(checks=50) + flann = cv2.FlannBasedMatcher(index_params, search_params) + matches = flann.knnMatch(des1, des2, k=2) + else: + matcher = cv2.DescriptorMatcher().create('BruteForce') + matches = matcher.knnMatch(des1, des2, k=2) + + good = [] + pts1 = [] + pts2 = [] + # ratio test as per Lowe's paper + for i, (m, n) in enumerate(matches): + if m.distance < 0.8 * n.distance: + good.append(m) + pts2.append(kp2[m.trainIdx].pt) + pts1.append(kp1[m.queryIdx].pt) + + pts1 = np.int32(pts1) + pts2 = np.int32(pts2) + return pts1, pts2 + + +def get_pose_pnp(rgb_curr, rgb_near, depth_curr, K): + gray_curr = rgb2gray(rgb_curr).astype(np.uint8) + gray_near = rgb2gray(rgb_near).astype(np.uint8) + height, width = gray_curr.shape + + pts2d_curr, pts2d_near = feature_match(gray_curr, + gray_near) # feature matching + + # dilation of depth + kernel = 
np.ones((4, 4), np.uint8) + depth_curr_dilated = cv2.dilate(depth_curr, kernel) + + # extract 3d pts + pts3d_curr = [] + pts2d_near_filtered = [ + ] # keep only feature points with depth in the current frame + for i, pt2d in enumerate(pts2d_curr): + # print(pt2d) + u, v = pt2d[0], pt2d[1] + z = depth_curr_dilated[v, u] + if z > 0: + xyz_curr = convert_2d_to_3d(u, v, z, K) + pts3d_curr.append(xyz_curr) + pts2d_near_filtered.append(pts2d_near[i]) + + # the minimal number of points accepted by solvePnP is 4: + if len(pts3d_curr) >= 4 and len(pts2d_near_filtered) >= 4: + pts3d_curr = np.expand_dims( + np.array(pts3d_curr).astype(np.float32), axis=1) + pts2d_near_filtered = np.expand_dims( + np.array(pts2d_near_filtered).astype(np.float32), axis=1) + + # ransac + ret = cv2.solvePnPRansac( + pts3d_curr, pts2d_near_filtered, K, distCoeffs=None) + success = ret[0] + rotation_vector = ret[1] + translation_vector = ret[2] + return (success, rotation_vector, translation_vector) + else: + return (0, None, None) diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py new file mode 100644 index 000000000..2d4cab3c6 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py @@ -0,0 +1,617 @@ +from __future__ import division +import numbers +import types + +import numpy as np +import scipy.ndimage.interpolation as itpl +import skimage.transform +import torch +from PIL import Image, ImageEnhance + +try: + import accimage +except ImportError: + accimage = None + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def _is_pil_image(img): + if accimage is not None: + return isinstance(img, (Image.Image, accimage.Image)) + else: + return isinstance(img, Image.Image) + + +def _is_tensor_image(img): + return torch.is_tensor(img) and img.ndimension() == 3 + + +def adjust_brightness(img, brightness_factor): + """Adjust brightness of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + PIL Image: Brightness adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Brightness(img) + img = enhancer.enhance(brightness_factor) + return img + + +def adjust_contrast(img, contrast_factor): + """Adjust contrast of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + PIL Image: Contrast adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(contrast_factor) + return img + + +def adjust_saturation(img, saturation_factor): + """Adjust color saturation of an image. + + Args: + img (PIL Image): PIL Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + PIL Image: Saturation adjusted image. 
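+
+     Example (illustrative; assumes ``img`` is an already-opened PIL Image):
+         >>> img_desat = adjust_saturation(img, 0.5)  # blend halfway toward grayscale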
+ """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Color(img) + img = enhancer.enhance(saturation_factor) + return img + + +def adjust_hue(img, hue_factor): + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and + cyclically shifting the intensities in the hue channel (H). + The image is then converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + See https://en.wikipedia.org/wiki/Hue for more details on Hue. + + Args: + img (PIL Image): PIL Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + PIL Image: Hue adjusted image. + """ + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError( + 'hue_factor is not in [-0.5, 0.5]. Got {}'.format(hue_factor)) + + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + input_mode = img.mode + if input_mode in {'L', '1', 'I', 'F'}: + return img + + h, s, v = img.convert('HSV').split() + + np_h = np.array(h, dtype=np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + np_h += np.uint8(hue_factor * 255) + h = Image.fromarray(np_h, 'L') + + img = Image.merge('HSV', (h, s, v)).convert(input_mode) + return img + + +def adjust_gamma(img, gamma, gain=1): + """Perform gamma correction on an image. + + Also known as Power Law Transform. Intensities in RGB mode are adjusted + based on the following equation: + + I_out = 255 * gain * ((I_in / 255) ** gamma) + + See https://en.wikipedia.org/wiki/Gamma_correction for more details. + + Args: + img (PIL Image): PIL Image to be adjusted. + gamma (float): Non negative real number. gamma larger than 1 make the + shadows darker, while gamma smaller than 1 make dark regions + lighter. + gain (float): The constant multiplier. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + if gamma < 0: + raise ValueError('Gamma should be a non-negative real number') + + input_mode = img.mode + img = img.convert('RGB') + + np_img = np.array(img, dtype=np.float32) + np_img = 255 * gain * ((np_img / 255)**gamma) + np_img = np.uint8(np.clip(np_img, 0, 255)) + + img = Image.fromarray(np_img, 'RGB').convert(input_mode) + return img + + +class Compose(object): + """Composes several transforms together. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + img = t(img) + return img + + +class ToTensor(object): + """Convert a ``numpy.ndarray`` to tensor. + + Converts a numpy.ndarray (H x W x C) to a torch.FloatTensor of shape (C x H x W). + """ + + def __call__(self, img): + """Convert a ``numpy.ndarray`` to tensor. + + Args: + img (numpy.ndarray): Image to be converted to tensor. + + Returns: + Tensor: Converted image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. 
Got {}'.format(type(img))) + + if isinstance(img, np.ndarray): + # handle numpy array + if img.ndim == 3: + img = torch.from_numpy(img.transpose((2, 0, 1)).copy()) + elif img.ndim == 2: + img = torch.from_numpy(img.copy()) + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'. + format(img.ndim)) + + return img + + +class NormalizeNumpyArray(object): + """Normalize a ``numpy.ndarray`` with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(M1,..,Mn)`` for ``n`` channels, this transform + will normalize each channel of the input ``numpy.ndarray`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, img): + """ + Args: + img (numpy.ndarray): Image of size (H, W, C) to be normalized. + + Returns: + Tensor: Normalized image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + # TODO: make efficient + # print(img.shape) + for i in range(3): + img[:, :, i] = (img[:, :, i] - self.mean[i]) / self.std[i] + return img + + +class NormalizeTensor(object): + """Normalize an tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(M1,..,Mn)`` for ``n`` channels, this transform + will normalize each channel of the input ``torch.*Tensor`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + + Returns: + Tensor: Normalized Tensor image. + """ + if not _is_tensor_image(tensor): + raise TypeError('tensor is not a torch image.') + # TODO: make efficient + for t, m, s in zip(tensor, self.mean, self.std): + t.sub_(m).div_(s) + return tensor + + +class Rotate(object): + """Rotates the given ``numpy.ndarray``. + + Args: + angle (float): The rotation angle in degrees. + """ + + def __init__(self, angle): + self.angle = angle + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be rotated. + + Returns: + img (numpy.ndarray (C x H x W)): Rotated image. + """ + + # order=0 means nearest-neighbor type interpolation + return skimage.transform.rotate(img, self.angle, resize=False, order=0) + + +class Resize(object): + """Resize the the given ``numpy.ndarray`` to the given size. + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size) + interpolation (int, optional): Desired interpolation. Default is + ``PIL.Image.BILINEAR`` + """ + + def __init__(self, size, interpolation='nearest'): + assert isinstance(size, float) + self.size = size + self.interpolation = interpolation + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be scaled. + Returns: + img (numpy.ndarray (C x H x W)): Rescaled image. 
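+
+         Example (illustrative; ``img`` is a 2-D or 3-D ndarray):
+             >>> img_half = Resize(0.5)(img)  # nearest-neighbour rescale by a factor of 0.5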
+ """ + if img.ndim == 3: + return skimage.transform.rescale(img, self.size, order=0) + elif img.ndim == 2: + return skimage.transform.rescale(img, self.size, order=0) + else: + RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class CenterCrop(object): + """Crops the given ``numpy.ndarray`` at the center. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for center crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for center crop. + """ + h = img.shape[0] + w = img.shape[1] + th, tw = output_size + i = int(round((h - th) / 2.)) + j = int(round((w - tw) / 2.)) + + # # randomized cropping + # i = np.random.randint(i-3, i+4) + # j = np.random.randint(j-3, j+4) + + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + i, j, h, w = self.get_params(img, self.size) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[i:i + h, j:j + w, :] + elif img.ndim == 2: + return img[i:i + h, j:j + w] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class BottomCrop(object): + """Crops the given ``numpy.ndarray`` at the bottom. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for bottom crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for bottom crop. + """ + h = img.shape[0] + w = img.shape[1] + th, tw = output_size + i = h - th + j = int(round((w - tw) / 2.)) + + # randomized left and right cropping + # i = np.random.randint(i-3, i+4) + # j = np.random.randint(j-1, j+1) + + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + i, j, h, w = self.get_params(img, self.size) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[i:i + h, j:j + w, :] + elif img.ndim == 2: + return img[i:i + h, j:j + w] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. 
Got {}'.format( + img.ndim)) + + +class Crop(object): + """Crops the given ``numpy.ndarray`` at the center. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, crop): + self.crop = crop + + @staticmethod + def get_params(img, crop): + """Get parameters for ``crop`` for center crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for center crop. + """ + x_l, x_r, y_b, y_t = crop + h = img.shape[0] + w = img.shape[1] + assert x_l >= 0 and x_l < w + assert x_r >= 0 and x_r < w + assert y_b >= 0 and y_b < h + assert y_t >= 0 and y_t < h + assert x_l < x_r and y_b < y_t + + return x_l, x_r, y_b, y_t + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + x_l, x_r, y_b, y_t = self.get_params(img, self.crop) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[y_b:y_t, x_l:x_r, :] + elif img.ndim == 2: + return img[y_b:y_t, x_l:x_r] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class Lambda(object): + """Apply a user-defined lambda as a transform. + + Args: + lambd (function): Lambda/function to be used for transform. + """ + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img): + return self.lambd(img) + + +class HorizontalFlip(object): + """Horizontally flip the given ``numpy.ndarray``. + + Args: + do_flip (boolean): whether or not do horizontal flip. + + """ + + def __init__(self, do_flip): + self.do_flip = do_flip + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be flipped. + + Returns: + img (numpy.ndarray (C x H x W)): flipped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + + if self.do_flip: + return np.fliplr(img) + else: + return img + + +class ColorJitter(object): + """Randomly change the brightness, contrast and saturation of an image. + + Args: + brightness (float): How much to jitter brightness. brightness_factor + is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. + contrast (float): How much to jitter contrast. contrast_factor + is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. + saturation (float): How much to jitter saturation. saturation_factor + is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. + hue(float): How much to jitter hue. hue_factor is chosen uniformly from + [-hue, hue]. Should be >=0 and <= 0.5. 
+ """ + + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): + transforms = [] + transforms.append( + Lambda(lambda img: adjust_brightness(img, brightness))) + transforms.append(Lambda(lambda img: adjust_contrast(img, contrast))) + transforms.append( + Lambda(lambda img: adjust_saturation(img, saturation))) + transforms.append(Lambda(lambda img: adjust_hue(img, hue))) + np.random.shuffle(transforms) + self.transform = Compose(transforms) + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Input image. + + Returns: + img (numpy.ndarray (C x H x W)): Color jittered image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + + pil = Image.fromarray(img) + return np.array(self.transform(pil)) diff --git a/modelscope/models/cv/self_supervised_depth_completion/helper.py b/modelscope/models/cv/self_supervised_depth_completion/helper.py new file mode 100644 index 000000000..5a9069bdc --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/helper.py @@ -0,0 +1,269 @@ +import csv +import os +import shutil +import time + +import torch + +from modelscope.models.cv.self_supervised_depth_completion import vis_utils +from modelscope.models.cv.self_supervised_depth_completion.metrics import \ + Result + +fieldnames = [ + 'epoch', 'rmse', 'photo', 'mae', 'irmse', 'imae', 'mse', 'absrel', 'lg10', + 'silog', 'squared_rel', 'delta1', 'delta2', 'delta3', 'data_time', + 'gpu_time' +] + + +class logger: + + def __init__(self, args, prepare=True): + self.args = args + output_directory = get_folder_name(args) + self.output_directory = output_directory + self.best_result = Result() + self.best_result.set_to_worst() + + if not prepare: + return + if not os.path.exists(output_directory): + os.makedirs(output_directory) + self.train_csv = os.path.join(output_directory, 'train.csv') + self.val_csv = os.path.join(output_directory, 'val.csv') + self.best_txt = os.path.join(output_directory, 'best.txt') + + # backup the source code + if args.resume == '': + print('=> creating source code backup ...') + backup_directory = os.path.join(output_directory, 'code_backup') + self.backup_directory = backup_directory + # backup_source_code(backup_directory) + # create new csv files with only header + with open(self.train_csv, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + with open(self.val_csv, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + print('=> finished creating source code backup.') + + def conditional_print(self, split, i, epoch, lr, n_set, blk_avg_meter, + avg_meter): + if (i + 1) % self.args.print_freq == 0: + avg = avg_meter.average() + blk_avg = blk_avg_meter.average() + print('=> output: {}'.format(self.output_directory)) + print( + '{split} Epoch: {0} [{1}/{2}]\tlr={lr} ' + 't_Data={blk_avg.data_time:.3f}({average.data_time:.3f}) ' + 't_GPU={blk_avg.gpu_time:.3f}({average.gpu_time:.3f})\n\t' + 'RMSE={blk_avg.rmse:.2f}({average.rmse:.2f}) ' + 'MAE={blk_avg.mae:.2f}({average.mae:.2f}) ' + 'iRMSE={blk_avg.irmse:.2f}({average.irmse:.2f}) ' + 'iMAE={blk_avg.imae:.2f}({average.imae:.2f})\n\t' + 'silog={blk_avg.silog:.2f}({average.silog:.2f}) ' + 'squared_rel={blk_avg.squared_rel:.2f}({average.squared_rel:.2f}) ' + 'Delta1={blk_avg.delta1:.3f}({average.delta1:.3f}) ' + 'REL={blk_avg.absrel:.3f}({average.absrel:.3f})\n\t' + 'Lg10={blk_avg.lg10:.3f}({average.lg10:.3f}) ' + 
'Photometric={blk_avg.photometric:.3f}({average.photometric:.3f}) ' + .format( + epoch, + i + 1, + n_set, + lr=lr, + blk_avg=blk_avg, + average=avg, + split=split.capitalize())) + blk_avg_meter.reset() + + def conditional_save_info(self, split, average_meter, epoch): + avg = average_meter.average() + if split == 'train': + csvfile_name = self.train_csv + elif split == 'val': + csvfile_name = self.val_csv + elif split == 'eval': + eval_filename = os.path.join(self.output_directory, 'eval.txt') + self.save_single_txt(eval_filename, avg, epoch) + return avg + elif 'test' in split: + return avg + else: + raise ValueError('wrong split provided to logger') + with open(csvfile_name, 'a') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writerow({ + 'epoch': epoch, + 'rmse': avg.rmse, + 'photo': avg.photometric, + 'mae': avg.mae, + 'irmse': avg.irmse, + 'imae': avg.imae, + 'mse': avg.mse, + 'silog': avg.silog, + 'squared_rel': avg.squared_rel, + 'absrel': avg.absrel, + 'lg10': avg.lg10, + 'delta1': avg.delta1, + 'delta2': avg.delta2, + 'delta3': avg.delta3, + 'gpu_time': avg.gpu_time, + 'data_time': avg.data_time + }) + return avg + + def save_single_txt(self, filename, result, epoch): + with open(filename, 'w') as txtfile: + txtfile.write( + ('rank_metric={}\n' + 'epoch={}\n' + 'rmse={:.3f}\n' + + 'mae={:.3f}\n' + 'silog={:.3f}\n' + 'squared_rel={:.3f}\n' + + 'irmse={:.3f}\n' + 'imae={:.3f}\n' + 'mse={:.3f}\n' + + 'absrel={:.3f}\n' + 'lg10={:.3f}\n' + + 'delta1={:.3f}\n' + 't_gpu={:.4f}').format( + self.args.rank_metric, epoch, result.rmse, result.mae, + result.silog, result.squared_rel, result.irmse, + result.imae, result.mse, result.absrel, result.lg10, + result.delta1, result.gpu_time)) + + def save_best_txt(self, result, epoch): + self.save_single_txt(self.best_txt, result, epoch) + + def _get_img_comparison_name(self, mode, epoch, is_best=False): + if mode == 'eval': + return self.output_directory + '/comparison_eval.png' + if mode == 'val': + if is_best: + return self.output_directory + '/comparison_best.png' + else: + return self.output_directory + '/comparison_' + str( + epoch) + '.png' + + def conditional_save_img_comparison(self, mode, i, ele, pred, epoch): + # save 8 images for visualization + if mode == 'val' or mode == 'eval': + skip = 100 + if i == 0: + self.img_merge = vis_utils.merge_into_row(ele, pred) + elif i % skip == 0 and i < 8 * skip: + row = vis_utils.merge_into_row(ele, pred) + self.img_merge = vis_utils.add_row(self.img_merge, row) + elif i == 8 * skip: + filename = self._get_img_comparison_name(mode, epoch) + vis_utils.save_image(self.img_merge, filename) + return self.img_merge + + def save_img_comparison_as_best(self, mode, epoch): + if mode == 'val': + filename = self._get_img_comparison_name(mode, epoch, is_best=True) + vis_utils.save_image(self.img_merge, filename) + + def get_ranking_error(self, result): + return getattr(result, self.args.rank_metric) + + def rank_conditional_save_best(self, mode, result, epoch): + error = self.get_ranking_error(result) + best_error = self.get_ranking_error(self.best_result) + is_best = error < best_error + if is_best and mode == 'val': + self.old_best_result = self.best_result + self.best_result = result + self.save_best_txt(result, epoch) + return is_best + + def conditional_save_pred(self, mode, i, pred, epoch): + if ('test' in mode or mode == 'eval') and self.args.save_pred: + + # save images for visualization/ testing + image_folder = os.path.join(self.output_directory, + mode + '_output') + if 
not os.path.exists(image_folder): + os.makedirs(image_folder) + img = torch.squeeze(pred.data.cpu()).numpy() + filename = os.path.join(image_folder, '{0:010d}.png'.format(i)) + vis_utils.save_depth_as_uint16png(img, filename) + + def conditional_summarize(self, mode, avg, is_best): + print('\n*\nSummary of ', mode, 'round') + print('' + 'RMSE={average.rmse:.3f}\n' + 'MAE={average.mae:.3f}\n' + 'Photo={average.photometric:.3f}\n' + 'iRMSE={average.irmse:.3f}\n' + 'iMAE={average.imae:.3f}\n' + 'squared_rel={average.squared_rel}\n' + 'silog={average.silog}\n' + 'Delta1={average.delta1:.3f}\n' + 'REL={average.absrel:.3f}\n' + 'Lg10={average.lg10:.3f}\n' + 't_GPU={time:.3f}'.format(average=avg, time=avg.gpu_time)) + if is_best and mode == 'val': + print('New best model by %s (was %.3f)' % + (self.args.rank_metric, + self.get_ranking_error(self.old_best_result))) + elif mode == 'val': + print('(best %s is %.3f)' % + (self.args.rank_metric, + self.get_ranking_error(self.best_result))) + print('*\n') + + +ignore_hidden = shutil.ignore_patterns('.', '..', '.git*', '*pycache*', + '*build', '*.fuse*', '*_drive_*') + + +def backup_source_code(backup_directory): + if os.path.exists(backup_directory): + shutil.rmtree(backup_directory) + shutil.copytree('.', backup_directory, ignore=ignore_hidden) + + +def adjust_learning_rate(lr_init, optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 5 epochs""" + lr = lr_init * (0.1**(epoch // 5)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + + +def save_checkpoint(state, is_best, epoch, output_directory): + checkpoint_filename = os.path.join(output_directory, + 'checkpoint-' + str(epoch) + '.pth.tar') + torch.save(state, checkpoint_filename) + if is_best: + best_filename = os.path.join(output_directory, 'model_best.pth.tar') + shutil.copyfile(checkpoint_filename, best_filename) + if epoch > 0: + prev_checkpoint_filename = os.path.join( + output_directory, 'checkpoint-' + str(epoch - 1) + '.pth.tar') + if os.path.exists(prev_checkpoint_filename): + os.remove(prev_checkpoint_filename) + + +def get_folder_name(args): + # current_time = time.strftime('%Y-%m-%d@%H-%M') + # if args.use_pose: + # prefix = 'mode={}.w1={}.w2={}.'.format(args.train_mode, args.w1, + # args.w2) + # else: + # prefix = 'mode={}.'.format(args.train_mode) + # return os.path.join(args.result, + # prefix + 'input={}.resnet{}.criterion={}.lr={}.bs={}.wd={}.pretrained={}.jitter={}.time={}'. 
+ # format(args.input, args.layers, args.criterion, \ + # args.lr, args.batch_size, args.weight_decay, \ + # args.pretrained, args.jitter, current_time + # )) + return os.path.join(args.result, 'test') + + +avgpool = torch.nn.AvgPool2d(kernel_size=2, stride=2).cuda() + + +def multiscale(img): + img1 = avgpool(img) + img2 = avgpool(img1) + img3 = avgpool(img2) + img4 = avgpool(img3) + img5 = avgpool(img4) + return img5, img4, img3, img2, img1 diff --git a/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py b/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py new file mode 100644 index 000000000..08963fc9c --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py @@ -0,0 +1,141 @@ +import torch +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class Intrinsics: + """Intrinsics""" + + def __init__(self, width, height, fu, fv, cu=0, cv=0): + self.height, self.width = height, width + self.fu, self.fv = fu, fv # fu, fv: focal length along the horizontal and vertical axes + + # cu, cv: optical center along the horizontal and vertical axes + self.cu = cu if cu > 0 else (width - 1) / 2.0 + self.cv = cv if cv > 0 else (height - 1) / 2.0 + + # U, V represent the homogeneous horizontal and vertical coordinates in the pixel space + self.U = torch.arange(start=0, end=width).expand(height, width).float() + self.V = torch.arange( + start=0, end=height).expand(width, height).t().float() + + # X_cam, Y_cam represent the homogeneous x, y coordinates (assuming depth z=1) in the camera coordinate system + self.X_cam = (self.U - self.cu) / self.fu + self.Y_cam = (self.V - self.cv) / self.fv + + self.is_cuda = False + + def cuda(self): + self.X_cam.data = self.X_cam.data.cuda() + self.Y_cam.data = self.Y_cam.data.cuda() + self.is_cuda = True + return self + + def scale(self, height, width): + # return a new set of corresponding intrinsic parameters for the scaled image + ratio_u = float(width) / self.width + ratio_v = float(height) / self.height + fu = ratio_u * self.fu + fv = ratio_v * self.fv + cu = ratio_u * self.cu + cv = ratio_v * self.cv + new_intrinsics = Intrinsics(width, height, fu, fv, cu, cv) + if self.is_cuda: + new_intrinsics.cuda() + return new_intrinsics + + def __print__(self): + logger.info( + 'size=({},{})\nfocal length=({},{})\noptical center=({},{})'. 
+ format(self.height, self.width, self.fv, self.fu, self.cv, + self.cu)) + + +def image_to_pointcloud(depth, intrinsics): + assert depth.dim() == 4 + assert depth.size(1) == 1 + + X = depth * intrinsics.X_cam + Y = depth * intrinsics.Y_cam + return torch.cat((X, Y, depth), dim=1) + + +def pointcloud_to_image(pointcloud, intrinsics): + assert pointcloud.dim() == 4 + + batch_size = pointcloud.size(0) + X = pointcloud[:, 0, :, :] # .view(batch_size, -1) + Y = pointcloud[:, 1, :, :] # .view(batch_size, -1) + Z = pointcloud[:, 2, :, :].clamp(min=1e-3) # .view(batch_size, -1) + + # compute pixel coordinates + U_proj = intrinsics.fu * X / Z + intrinsics.cu # horizontal pixel coordinate + V_proj = intrinsics.fv * Y / Z + intrinsics.cv # vertical pixel coordinate + + # normalization to [-1, 1], required by torch.nn.functional.grid_sample + w = intrinsics.width + h = intrinsics.height + U_proj_normalized = (2 * U_proj / (w - 1) - 1).view(batch_size, -1) + V_proj_normalized = (2 * V_proj / (h - 1) - 1).view(batch_size, -1) + + # This was important since PyTorch didn't do as it claimed for points out of boundary + # See https://github.com/ClementPinard/SfmLearner-Pytorch/blob/master/inverse_warp.py + # Might not be necessary any more + U_proj_mask = ((U_proj_normalized > 1) + (U_proj_normalized < -1)).detach() + U_proj_normalized[U_proj_mask] = 2 + V_proj_mask = ((V_proj_normalized > 1) + (V_proj_normalized < -1)).detach() + V_proj_normalized[V_proj_mask] = 2 + + pixel_coords = torch.stack([U_proj_normalized, V_proj_normalized], + dim=2) # [B, H*W, 2] + return pixel_coords.view(batch_size, intrinsics.height, intrinsics.width, + 2) + + +def batch_multiply(batch_scalar, batch_matrix): + # input: batch_scalar of size b, batch_matrix of size b * 3 * 3 + # output: batch_matrix of size b * 3 * 3 + batch_size = batch_scalar.size(0) + output = batch_matrix.clone() + for i in range(batch_size): + output[i] = batch_scalar[i] * batch_matrix[i] + return output + + +def transform_curr_to_near(pointcloud_curr, r_mat, t_vec, intrinsics): + # translation and rotmat represent the transformation from tgt pose to src pose + batch_size = pointcloud_curr.size(0) + XYZ_ = torch.bmm(r_mat, pointcloud_curr.view(batch_size, 3, -1)) + + X = (XYZ_[:, 0, :] + t_vec[:, 0].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + Y = (XYZ_[:, 1, :] + t_vec[:, 1].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + Z = (XYZ_[:, 2, :] + t_vec[:, 2].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + + pointcloud_near = torch.cat((X, Y, Z), dim=1) + + return pointcloud_near + + +def homography_from(rgb_near, depth_curr, r_mat, t_vec, intrinsics): + # inverse warp the RGB image from the nearby frame to the current frame + + # to ensure dimension consistency + r_mat = r_mat.view(-1, 3, 3) + t_vec = t_vec.view(-1, 3) + + # compute source pixel coordinate + pointcloud_curr = image_to_pointcloud(depth_curr, intrinsics) + pointcloud_near = transform_curr_to_near(pointcloud_curr, r_mat, t_vec, + intrinsics) + pixel_coords_near = pointcloud_to_image(pointcloud_near, intrinsics) + + # the warping + warped = F.grid_sample(rgb_near, pixel_coords_near) + + return warped diff --git a/modelscope/models/cv/self_supervised_depth_completion/metrics.py b/modelscope/models/cv/self_supervised_depth_completion/metrics.py new file mode 100644 index 000000000..58bb9d5f2 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/metrics.py @@ -0,0 +1,181 @@ +import math + +import numpy as np +import 
torch + +lg_e_10 = math.log(10) + + +def log10(x): + """Convert a new tensor with the base-10 logarithm of the elements of x. """ + return torch.log(x) / lg_e_10 + + +class Result(object): + """Result""" + + def __init__(self): + self.irmse = 0 + self.imae = 0 + self.mse = 0 + self.rmse = 0 + self.mae = 0 + self.absrel = 0 + self.squared_rel = 0 + self.lg10 = 0 + self.delta1 = 0 + self.delta2 = 0 + self.delta3 = 0 + self.data_time = 0 + self.gpu_time = 0 + self.silog = 0 # Scale invariant logarithmic error [log(m)*100] + self.photometric = 0 + + def set_to_worst(self): + self.irmse = np.inf + self.imae = np.inf + self.mse = np.inf + self.rmse = np.inf + self.mae = np.inf + self.absrel = np.inf + self.squared_rel = np.inf + self.lg10 = np.inf + self.silog = np.inf + self.delta1 = 0 + self.delta2 = 0 + self.delta3 = 0 + self.data_time = 0 + self.gpu_time = 0 + + def update(self, + irmse, + imae, + mse, + rmse, + mae, + absrel, + squared_rel, + lg10, + delta1, + delta2, + delta3, + gpu_time, + data_time, + silog, + photometric=0): + """update""" + self.irmse = irmse + self.imae = imae + self.mse = mse + self.rmse = rmse + self.mae = mae + self.absrel = absrel + self.squared_rel = squared_rel + self.lg10 = lg10 + self.delta1 = delta1 + self.delta2 = delta2 + self.delta3 = delta3 + self.data_time = data_time + self.gpu_time = gpu_time + self.silog = silog + self.photometric = photometric + + def evaluate(self, output, target, photometric=0): + """evaluate""" + valid_mask = target > 0.1 + + # convert from meters to mm + output_mm = 1e3 * output[valid_mask] + target_mm = 1e3 * target[valid_mask] + + abs_diff = (output_mm - target_mm).abs() + + self.mse = float((torch.pow(abs_diff, 2)).mean()) + self.rmse = math.sqrt(self.mse) + self.mae = float(abs_diff.mean()) + self.lg10 = float((log10(output_mm) - log10(target_mm)).abs().mean()) + self.absrel = float((abs_diff / target_mm).mean()) + self.squared_rel = float(((abs_diff / target_mm)**2).mean()) + + maxRatio = torch.max(output_mm / target_mm, target_mm / output_mm) + self.delta1 = float((maxRatio < 1.25).float().mean()) + self.delta2 = float((maxRatio < 1.25**2).float().mean()) + self.delta3 = float((maxRatio < 1.25**3).float().mean()) + self.data_time = 0 + self.gpu_time = 0 + + # silog uses meters + err_log = torch.log(target[valid_mask]) - torch.log(output[valid_mask]) + normalized_squared_log = (err_log**2).mean() + log_mean = err_log.mean() + self.silog = math.sqrt(normalized_squared_log + - log_mean * log_mean) * 100 + + # convert from meters to km + inv_output_km = (1e-3 * output[valid_mask])**(-1) + inv_target_km = (1e-3 * target[valid_mask])**(-1) + abs_inv_diff = (inv_output_km - inv_target_km).abs() + self.irmse = math.sqrt((torch.pow(abs_inv_diff, 2)).mean()) + self.imae = float(abs_inv_diff.mean()) + + self.photometric = float(photometric) + + +class AverageMeter(object): + """AverageMeter""" + + def __init__(self): + self.reset() + + def reset(self): + """reset""" + self.count = 0.0 + self.sum_irmse = 0 + self.sum_imae = 0 + self.sum_mse = 0 + self.sum_rmse = 0 + self.sum_mae = 0 + self.sum_absrel = 0 + self.sum_squared_rel = 0 + self.sum_lg10 = 0 + self.sum_delta1 = 0 + self.sum_delta2 = 0 + self.sum_delta3 = 0 + self.sum_data_time = 0 + self.sum_gpu_time = 0 + self.sum_photometric = 0 + self.sum_silog = 0 + + def update(self, result, gpu_time, data_time, n=1): + """update""" + self.count += n + self.sum_irmse += n * result.irmse + self.sum_imae += n * result.imae + self.sum_mse += n * result.mse + self.sum_rmse += n * result.rmse 
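As a reference for the metric definitions above, here is a minimal, self-contained sketch (not part of the patch) of the arithmetic that Result.evaluate performs on a toy prediction/ground-truth pair; zeros in the target mark missing LiDAR returns and are masked out, and errors are reported in millimetres:

import math

import torch

output = torch.tensor([[1.00, 2.10], [3.20, 0.50]])  # predicted depth in metres
target = torch.tensor([[1.10, 2.00], [0.00, 0.45]])  # ground truth, 0 = no return

valid = target > 0.1                 # same validity mask as Result.evaluate
out_mm = 1e3 * output[valid]         # metres -> millimetres
tgt_mm = 1e3 * target[valid]

abs_diff = (out_mm - tgt_mm).abs()
rmse = math.sqrt(float((abs_diff ** 2).mean()))
mae = float(abs_diff.mean())
max_ratio = torch.max(out_mm / tgt_mm, tgt_mm / out_mm)
delta1 = float((max_ratio < 1.25).float().mean())
print(f'rmse={rmse:.1f}mm  mae={mae:.1f}mm  delta1={delta1:.2f}')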
+ self.sum_mae += n * result.mae + self.sum_absrel += n * result.absrel + self.sum_squared_rel += n * result.squared_rel + self.sum_lg10 += n * result.lg10 + self.sum_delta1 += n * result.delta1 + self.sum_delta2 += n * result.delta2 + self.sum_delta3 += n * result.delta3 + self.sum_data_time += n * data_time + self.sum_gpu_time += n * gpu_time + self.sum_silog += n * result.silog + self.sum_photometric += n * result.photometric + + def average(self): + """average""" + avg = Result() + if self.count > 0: + avg.update( + self.sum_irmse / self.count, self.sum_imae / self.count, + self.sum_mse / self.count, self.sum_rmse / self.count, + self.sum_mae / self.count, self.sum_absrel / self.count, + self.sum_squared_rel / self.count, self.sum_lg10 / self.count, + self.sum_delta1 / self.count, self.sum_delta2 / self.count, + self.sum_delta3 / self.count, self.sum_gpu_time / self.count, + self.sum_data_time / self.count, self.sum_silog / self.count, + self.sum_photometric / self.count) + return avg diff --git a/modelscope/models/cv/self_supervised_depth_completion/model.py b/modelscope/models/cv/self_supervised_depth_completion/model.py new file mode 100644 index 000000000..2a56b3178 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/model.py @@ -0,0 +1,215 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.models import resnet + + +def init_weights(m): + """init_weights""" + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + m.weight.data.normal_(0, 1e-3) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.ConvTranspose2d): + m.weight.data.normal_(0, 1e-3) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def conv_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + bn=True, + relu=True): + """conv_bn_relu""" + bias = not bn + layers = [] + layers.append( + nn.Conv2d( + in_channels, out_channels, kernel_size, stride, padding, + bias=bias)) + if bn: + layers.append(nn.BatchNorm2d(out_channels)) + if relu: + layers.append(nn.LeakyReLU(0.2, inplace=True)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.modules(): + init_weights(m) + + return layers + + +def convt_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bn=True, + relu=True): + """convt_bn_relu""" + bias = not bn + layers = [] + layers.append( + nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + bias=bias)) + if bn: + layers.append(nn.BatchNorm2d(out_channels)) + if relu: + layers.append(nn.LeakyReLU(0.2, inplace=True)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.modules(): + init_weights(m) + + return layers + + +class DepthCompletionNet(nn.Module): + """DepthCompletionNet""" + + def __init__(self, args): + assert ( + args.layers in [18, 34, 50, 101, 152] + ), f'Only layers 18, 34, 50, 101, and 152 are defined, but got {layers}'.format( + layers) + super(DepthCompletionNet, self).__init__() + self.modality = args.input + + if 'd' in self.modality: + channels = 64 // len(self.modality) + self.conv1_d = conv_bn_relu( + 1, channels, kernel_size=3, stride=1, padding=1) + if 'rgb' in self.modality: + channels = 64 * 3 // len(self.modality) + self.conv1_img = conv_bn_relu( + 3, channels, kernel_size=3, stride=1, padding=1) + elif 'g' in self.modality: + channels = 64 // 
len(self.modality) + self.conv1_img = conv_bn_relu( + 1, channels, kernel_size=3, stride=1, padding=1) + + pretrained_model = resnet.__dict__['resnet{}'.format(args.layers)]( + pretrained=args.pretrained) + if not args.pretrained: + pretrained_model.apply(init_weights) + # self.maxpool = pretrained_model._modules['maxpool'] + self.conv2 = pretrained_model._modules['layer1'] + self.conv3 = pretrained_model._modules['layer2'] + self.conv4 = pretrained_model._modules['layer3'] + self.conv5 = pretrained_model._modules['layer4'] + del pretrained_model # clear memory + + # define number of intermediate channels + if args.layers <= 34: + num_channels = 512 + elif args.layers >= 50: + num_channels = 2048 + self.conv6 = conv_bn_relu( + num_channels, 512, kernel_size=3, stride=2, padding=1) + + # decoding layers + kernel_size = 3 + stride = 2 + self.convt5 = convt_bn_relu( + in_channels=512, + out_channels=256, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt4 = convt_bn_relu( + in_channels=768, + out_channels=128, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt3 = convt_bn_relu( + in_channels=(256 + 128), + out_channels=64, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt2 = convt_bn_relu( + in_channels=(128 + 64), + out_channels=64, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt1 = convt_bn_relu( + in_channels=128, + out_channels=64, + kernel_size=kernel_size, + stride=1, + padding=1) + self.convtf = conv_bn_relu( + in_channels=128, + out_channels=1, + kernel_size=1, + stride=1, + bn=False, + relu=False) + + def forward(self, x): + """forward""" + # first layer + if 'd' in self.modality: + conv1_d = self.conv1_d(x['d']) + if 'rgb' in self.modality: + conv1_img = self.conv1_img(x['rgb']) + elif 'g' in self.modality: + conv1_img = self.conv1_img(x['g']) + + if self.modality == 'rgbd' or self.modality == 'gd': + conv1 = torch.cat((conv1_d, conv1_img), 1) + else: + conv1 = conv1_d if (self.modality == 'd') else conv1_img + + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) # batchsize * ? * 176 * 608 + conv4 = self.conv4(conv3) # batchsize * ? * 88 * 304 + conv5 = self.conv5(conv4) # batchsize * ? * 44 * 152 + conv6 = self.conv6(conv5) # batchsize * ? 
* 22 * 76 + + # decoder + convt5 = self.convt5(conv6) + y = torch.cat((convt5, conv5), 1) + + convt4 = self.convt4(y) + y = torch.cat((convt4, conv4), 1) + + convt3 = self.convt3(y) + y = torch.cat((convt3, conv3), 1) + + convt2 = self.convt2(y) + y = torch.cat((convt2, conv2), 1) + + convt1 = self.convt1(y) + y = torch.cat((convt1, conv1), 1) + + y = self.convtf(y) + + if self.training: + return 100 * y + else: + min_distance = 0.9 + return F.relu( + 100 * y - min_distance + ) + min_distance # the minimum range of Velodyne is around 3 feet ~= 0.9m diff --git a/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py b/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py new file mode 100644 index 000000000..4e7046f6b --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py @@ -0,0 +1,225 @@ +# import argparse +import os +import sys +import time +# import mmcv +from argparse import ArgumentParser +# import torchvision +from os import makedirs + +import cv2 +import numpy as np +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +from tqdm import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.self_supervised_depth_completion import (criteria, + helper) +from modelscope.models.cv.self_supervised_depth_completion.dataloaders.kitti_loader import ( + KittiDepth, input_options, load_calib, oheight, owidth) +from modelscope.models.cv.self_supervised_depth_completion.inverse_warp import ( + Intrinsics, homography_from) +from modelscope.models.cv.self_supervised_depth_completion.metrics import ( + AverageMeter, Result) +from modelscope.models.cv.self_supervised_depth_completion.model import \ + DepthCompletionNet +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# from modelscope.utils.config import Config + +m_logger = get_logger() + + +class ArgsList(): + """ArgsList Class""" + + def __init__(self) -> None: + self.workers = 4 + self.epochs = 11 + self.start_epoch = 0 + self.criterion = 'l2' + self.batch_size = 1 + self.learning_rate = 1e-5 + self.weight_decay = 0 + self.print_freq = 10 + self.resume = '' + self.data_folder = '../data' + self.input = 'gd' + self.layers = 34 + self.pretrained = True + self.val = 'select' + self.jitter = 0.1 + self.rank_metric = 'rmse' + self.evaluate = '' + self.cpu = False + + +@MODELS.register_module( + Tasks.self_supervised_depth_completion, + module_name=Models.self_supervised_depth_completion) +class SelfSupervisedDepthCompletion(TorchModel): + """SelfSupervisedDepthCompletion Class""" + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + args = ArgsList() + # define loss functions + self.depth_criterion = criteria.MaskedMSELoss() + self.photometric_criterion = criteria.PhotometricLoss() + self.smoothness_criterion = criteria.SmoothnessLoss() + + # args.use_pose = ('photo' in args.train_mode) + args.use_pose = True + # args.pretrained = not args.no_pretrained + args.use_rgb = ('rgb' in args.input) or args.use_pose + args.use_d = 'd' in args.input + args.use_g = 'g' in args.input + + args.evaluate = os.path.join(self.model_dir, 'model_best.pth') + + 
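A quick shape check may help when wiring this model in. The sketch below (illustrative only, assuming the patched modelscope package is importable) builds DepthCompletionNet for the grayscale-plus-depth ('gd') modality used by this pipeline and runs a dummy forward pass; height and width must be multiples of 16 because the encoder downsamples by 2 four times, and in eval mode the output is clamped to the ~0.9 m minimum range by the ReLU trick above.

from types import SimpleNamespace

import torch

from modelscope.models.cv.self_supervised_depth_completion.model import \
    DepthCompletionNet

# pretrained=False avoids downloading ImageNet ResNet weights for this check
args = SimpleNamespace(layers=34, input='gd', pretrained=False)
net = DepthCompletionNet(args).eval()

batch = {
    'g': torch.rand(1, 1, 64, 64),  # grayscale image
    'd': torch.rand(1, 1, 64, 64),  # sparse depth
}
with torch.no_grad():
    depth = net(batch)
print(depth.shape)          # torch.Size([1, 1, 64, 64])
print(float(depth.min()))   # >= 0.9 because of the min-distance clamp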
if args.use_pose: + args.w1, args.w2 = 0.1, 0.1 + else: + args.w1, args.w2 = 0, 0 + + self.cuda = torch.cuda.is_available() and not args.cpu + if self.cuda: + import torch.backends.cudnn as cudnn + cudnn.benchmark = True + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + print("=> using '{}' for computation.".format(self.device)) + + args_new = args + if os.path.isfile(args.evaluate): + print( + "=> loading checkpoint '{}' ... ".format(args.evaluate), + end='') + self.checkpoint = torch.load( + args.evaluate, map_location=self.device) + args = self.checkpoint['args'] + args.val = args_new.val + print('Completed.') + else: + print("No model found at '{}'".format(args.evaluate)) + return + + print('=> creating model and optimizer ... ', end='') + model = DepthCompletionNet(args).to(self.device) + model_named_params = [ + p for _, p in model.named_parameters() if p.requires_grad + ] + optimizer = torch.optim.Adam( + model_named_params, lr=args.lr, weight_decay=args.weight_decay) + print('completed.') + if self.checkpoint is not None: + model.load_state_dict(self.checkpoint['model']) + optimizer.load_state_dict(self.checkpoint['optimizer']) + print('=> checkpoint state loaded.') + + model = torch.nn.DataParallel(model) + + self.model = model + self.args = args + + def iterate(self, mode, args, loader, model, optimizer, logger, epoch): + """iterate data""" + block_average_meter = AverageMeter() + average_meter = AverageMeter() + meters = [block_average_meter, average_meter] + merged_img = None + # switch to appropriate mode + assert mode in ['train', 'val', 'eval', 'test_prediction', 'test_completion'], \ + 'unsupported mode: {}'.format(mode) + model.eval() + lr = 0 + + for i, batch_data in enumerate(loader): + start = time.time() + batch_data = { + key: val.to(self.device) + for key, val in batch_data.items() if val is not None + } + gt = batch_data[ + 'gt'] if mode != 'test_prediction' and mode != 'test_completion' else None + data_time = time.time() - start + + start = time.time() + pred = model(batch_data) + photometric_loss = 0 + gpu_time = time.time() - start + + # measure accuracy and record loss + with torch.no_grad(): + mini_batch_size = next(iter(batch_data.values())).size(0) + result = Result() + if mode != 'test_prediction' and mode != 'test_completion': + result.evaluate(pred.data, gt.data, photometric_loss) + [ + m.update(result, gpu_time, data_time, mini_batch_size) + for m in meters + ] + logger.conditional_print(mode, i, epoch, lr, len(loader), + block_average_meter, average_meter) + merged_img = logger.conditional_save_img_comparison( + mode, i, batch_data, pred, epoch) + merged_img = cv2.cvtColor(merged_img, cv2.COLOR_RGB2BGR) + logger.conditional_save_pred(mode, i, pred, epoch) + + avg = logger.conditional_save_info(mode, average_meter, epoch) + is_best = logger.rank_conditional_save_best(mode, avg, epoch) + logger.save_img_comparison_as_best(mode, epoch) + logger.conditional_summarize(mode, avg, is_best) + + return avg, is_best, merged_img + + def forward(self, source_dir): + """main function""" + + args = self.args + args.data_folder = source_dir + args.result = os.path.join(args.data_folder, 'results') + if args.use_pose: + # hard-coded KITTI camera intrinsics + K = load_calib(args) + fu, fv = float(K[0, 0]), float(K[1, 1]) + cu, cv = float(K[0, 2]), float(K[1, 2]) + kitti_intrinsics = Intrinsics(owidth, oheight, fu, fv, cu, cv) + if self.cuda: + kitti_intrinsics = kitti_intrinsics.cuda() + + # Data loading code + print('=> creating data 
loaders ... ') + val_dataset = KittiDepth('val', self.args) + val_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=1, + shuffle=False, + num_workers=2, + pin_memory=True) # set batch size to be 1 for validation + print('\t==> val_loader size:{}'.format(len(val_loader))) + + # create backups and results folder + logger = helper.logger(self.args) + if self.checkpoint is not None: + logger.best_result = self.checkpoint['best_result'] + + print('=> starting model evaluation ...') + result, is_best, merged_img = self.iterate('val', self.args, + val_loader, self.model, + None, logger, + self.checkpoint['epoch']) + return merged_img diff --git a/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py b/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py new file mode 100644 index 000000000..38dfa43fa --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py @@ -0,0 +1,119 @@ +import os + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image + +if not ('DISPLAY' in os.environ): + import matplotlib as mpl + mpl.use('Agg') + +cmap = plt.cm.jet + + +def depth_colorize(depth): + depth = (depth - np.min(depth)) / (np.max(depth) - np.min(depth)) + depth = 255 * cmap(depth)[:, :, :3] # H, W, C + return depth.astype('uint8') + + +def merge_into_row(ele, pred): + + def preprocess_depth(x): + y = np.squeeze(x.data.cpu().numpy()) + return depth_colorize(y) + + # if is gray, transforms to rgb + img_list = [] + if 'rgb' in ele: + rgb = np.squeeze(ele['rgb'][0, ...].data.cpu().numpy()) + rgb = np.transpose(rgb, (1, 2, 0)) + img_list.append(rgb) + elif 'g' in ele: + g = np.squeeze(ele['g'][0, ...].data.cpu().numpy()) + g = np.array(Image.fromarray(g).convert('RGB')) + img_list.append(g) + if 'd' in ele: + img_list.append(preprocess_depth(ele['d'][0, ...])) + img_list.append(preprocess_depth(pred[0, ...])) + if 'gt' in ele: + img_list.append(preprocess_depth(ele['gt'][0, ...])) + + img_merge = np.hstack(img_list) + return img_merge.astype('uint8') + + +def add_row(img_merge, row): + return np.vstack([img_merge, row]) + + +def save_image(img_merge, filename): + image_to_write = cv2.cvtColor(img_merge, cv2.COLOR_RGB2BGR) + cv2.imwrite(filename, image_to_write) + + +def save_depth_as_uint16png(img, filename): + img = (img * 256).astype('uint16') + cv2.imwrite(filename, img) + + +if ('DISPLAY' in os.environ): + f, axarr = plt.subplots(4, 1) + plt.tight_layout() + plt.ion() + + +def display_warping(rgb_tgt, pred_tgt, warped): + + def preprocess(rgb_tgt, pred_tgt, warped): + rgb_tgt = 255 * np.transpose( + np.squeeze(rgb_tgt.data.cpu().numpy()), (1, 2, 0)) # H, W, C + # depth = np.squeeze(depth.cpu().numpy()) + # depth = depth_colorize(depth) + + # convert to log-scale + pred_tgt = np.squeeze(pred_tgt.data.cpu().numpy()) + # pred_tgt[pred_tgt<=0] = 0.9 # remove negative predictions + # pred_tgt = np.log10(pred_tgt) + + pred_tgt = depth_colorize(pred_tgt) + + warped = 255 * np.transpose( + np.squeeze(warped.data.cpu().numpy()), (1, 2, 0)) # H, W, C + recon_err = np.absolute( + warped.astype('float') - rgb_tgt.astype('float')) * ( + warped > 0) + recon_err = recon_err[:, :, 0] + recon_err[:, :, 1] + recon_err[:, :, + 2] + recon_err = depth_colorize(recon_err) + return rgb_tgt.astype('uint8'), warped.astype( + 'uint8'), recon_err, pred_tgt + + rgb_tgt, warped, recon_err, pred_tgt = preprocess(rgb_tgt, pred_tgt, + warped) + + # 1st column + # column = 0 + axarr[0].imshow(rgb_tgt) + axarr[0].axis('off') + 
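For context on how the comparison images are produced, here is a tiny standalone sketch (not part of the patch) of the colorization step performed by depth_colorize above, together with the 16-bit encoding used by save_depth_as_uint16png:

import matplotlib.pyplot as plt
import numpy as np

cmap = plt.cm.jet

depth = np.random.rand(48, 64) * 80.0   # toy depth map in metres
norm = (depth - depth.min()) / (depth.max() - depth.min())
colored = (255 * cmap(norm)[:, :, :3]).astype('uint8')   # RGB, alpha dropped
print(colored.shape, colored.dtype)      # (48, 64, 3) uint8

# save_depth_as_uint16png stores depth * 256 as a 16-bit image (KITTI-style encoding)
as_png = (depth * 256).astype('uint16')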
axarr[0].axis('equal') + # axarr[0, column].set_title('rgb_tgt') + + axarr[1].imshow(warped) + axarr[1].axis('off') + axarr[1].axis('equal') + # axarr[1, column].set_title('warped') + + axarr[2].imshow(recon_err, 'hot') + axarr[2].axis('off') + axarr[2].axis('equal') + # axarr[2, column].set_title('recon_err error') + + axarr[3].imshow(pred_tgt, 'hot') + axarr[3].axis('off') + axarr[3].axis('equal') + # axarr[3, column].set_title('pred_tgt') + + # plt.show() + plt.pause(0.001) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index 0d4027cb7..a1de71a97 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -9,8 +9,7 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from .common import Upsample, resize diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index a206e9f1c..e6c389d66 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -11,8 +11,7 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from torch import nn diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index d344de713..1b63bcd16 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -8,8 +8,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from .common import resize diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py index 1cec5f397..9dd40d0eb 100644 --- a/modelscope/models/cv/text_driven_segmentation/clip.py +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -8,9 +8,10 @@ import warnings from typing import Any, List, Union +import packaging +import packaging.version import torch from PIL import Image -from pkg_resources import packaging from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) from tqdm import tqdm diff --git a/modelscope/models/cv/video_depth_estimation/utils/depth.py b/modelscope/models/cv/video_depth_estimation/utils/depth.py index 5fbf6aa6d..2fc16a01e 100644 --- a/modelscope/models/cv/video_depth_estimation/utils/depth.py +++ b/modelscope/models/cv/video_depth_estimation/utils/depth.py @@ -3,7 +3,9 @@ import numpy as np import torch import torchvision.transforms as transforms -from matplotlib.cm import get_cmap +# from matplotlib.cm import get_cmap +# compatible with matplotlib 3.9.0 +from matplotlib.pyplot import get_cmap from modelscope.models.cv.video_depth_estimation.utils.image import ( flip_lr, gradient_x, gradient_y, interpolate_image, load_image) diff --git a/modelscope/models/cv/video_frame_interpolation/__init__.py b/modelscope/models/cv/video_frame_interpolation/__init__.py index 657a375ad..11492faf0 100644 --- 
a/modelscope/models/cv/video_frame_interpolation/__init__.py +++ b/modelscope/models/cv/video_frame_interpolation/__init__.py @@ -5,9 +5,10 @@ if TYPE_CHECKING: from .VFINet_arch import VFINet + from .rife import RIFEModel else: - _import_structure = {'VFINet_arch': ['VFINet']} + _import_structure = {'VFINet_arch': ['VFINet'], 'rife': ['RIFEModel']} import sys diff --git a/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py b/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py new file mode 100644 index 000000000..e904aad28 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py @@ -0,0 +1,158 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .warplayer import warp + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +def conv(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), nn.PReLU(out_planes)) + + +def conv_bn(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=False), nn.BatchNorm2d(out_planes), nn.PReLU(out_planes)) + + +class IFBlock(nn.Module): + + def __init__(self, in_planes, c=64): + super(IFBlock, self).__init__() + self.conv0 = nn.Sequential( + conv(in_planes, c // 2, 3, 2, 1), + conv(c // 2, c, 3, 2, 1), + ) + self.convblock0 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock1 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock2 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock3 = nn.Sequential(conv(c, c), conv(c, c)) + self.conv1 = nn.Sequential( + nn.ConvTranspose2d(c, c // 2, 4, 2, 1), + nn.PReLU(c // 2), + nn.ConvTranspose2d(c // 2, 4, 4, 2, 1), + ) + self.conv2 = nn.Sequential( + nn.ConvTranspose2d(c, c // 2, 4, 2, 1), + nn.PReLU(c // 2), + nn.ConvTranspose2d(c // 2, 1, 4, 2, 1), + ) + + def forward(self, x, flow, scale=1): + x = F.interpolate( + x, + scale_factor=1. / scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) + flow = F.interpolate( + flow, + scale_factor=1. / scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) * 1. 
/ scale + feat = self.conv0(torch.cat((x, flow), 1)) + feat = self.convblock0(feat) + feat + feat = self.convblock1(feat) + feat + feat = self.convblock2(feat) + feat + feat = self.convblock3(feat) + feat + flow = self.conv1(feat) + mask = self.conv2(feat) + flow = F.interpolate( + flow, + scale_factor=scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) * scale + mask = F.interpolate( + mask, + scale_factor=scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) + return flow, mask + + +class IFNet(nn.Module): + + def __init__(self): + super(IFNet, self).__init__() + self.block0 = IFBlock(7 + 4, c=90) + self.block1 = IFBlock(7 + 4, c=90) + self.block2 = IFBlock(7 + 4, c=90) + self.block_tea = IFBlock(10 + 4, c=90) + # self.contextnet = Contextnet() + # self.unet = Unet() + + def forward(self, x, scale_list=[4, 2, 1], training=False): + if training is False: + channel = x.shape[1] // 2 + img0 = x[:, :channel] + img1 = x[:, channel:] + flow_list = [] + merged = [] + mask_list = [] + warped_img0 = img0 + warped_img1 = img1 + flow = (x[:, :4]).detach() * 0 + mask = (x[:, :1]).detach() * 0 + # loss_cons = 0 + block = [self.block0, self.block1, self.block2] + for i in range(3): + f0, m0 = block[i]( + torch.cat((warped_img0[:, :3], warped_img1[:, :3], mask), 1), + flow, + scale=scale_list[i]) + f1, m1 = block[i]( + torch.cat((warped_img1[:, :3], warped_img0[:, :3], -mask), 1), + torch.cat((flow[:, 2:4], flow[:, :2]), 1), + scale=scale_list[i]) + flow = flow + (f0 + torch.cat((f1[:, 2:4], f1[:, :2]), 1)) / 2 + mask = mask + (m0 + (-m1)) / 2 + mask_list.append(mask) + flow_list.append(flow) + warped_img0 = warp(img0, flow[:, :2]) + warped_img1 = warp(img1, flow[:, 2:4]) + merged.append((warped_img0, warped_img1)) + ''' + c0 = self.contextnet(img0, flow[:, :2]) + c1 = self.contextnet(img1, flow[:, 2:4]) + tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1) + res = tmp[:, 1:4] * 2 - 1 + ''' + for i in range(3): + mask_list[i] = torch.sigmoid(mask_list[i]) + merged[i] = merged[i][0] * mask_list[i] + merged[i][1] * ( + 1 - mask_list[i]) + # merged[i] = torch.clamp(merged[i] + res, 0, 1) + return flow_list, mask_list[2], merged diff --git a/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py b/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py new file mode 100644 index 000000000..090b7cd76 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py @@ -0,0 +1,124 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import itertools + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim import AdamW + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .IFNet_HDv3 import * +from .loss import * +from .warplayer import warp + + +@MODELS.register_module( + Tasks.video_frame_interpolation, module_name=Models.rife) +class RIFEModel(TorchModel): + + def __init__(self, model_dir, *args, **kwargs): + 
super().__init__(model_dir, *args, **kwargs) + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.flownet = IFNet() + self.flownet.to(self.device) + self.optimG = AdamW( + self.flownet.parameters(), lr=1e-6, weight_decay=1e-4) + self.epe = EPE() + # self.vgg = VGGPerceptualLoss().to(device) + self.sobel = SOBEL() + self.load_model(model_dir, -1) + self.eval() + + def train(self): + self.flownet.train() + + def eval(self): + self.flownet.eval() + + def load_model(self, path, rank=0): + + def convert(param): + if rank == -1: + return { + k.replace('module.', ''): v + for k, v in param.items() if 'module.' in k + } + else: + return param + + if rank <= 0: + if torch.cuda.is_available(): + self.flownet.load_state_dict( + convert(torch.load('{}/flownet.pkl'.format(path)))) + else: + self.flownet.load_state_dict( + convert( + torch.load( + '{}/flownet.pkl'.format(path), + map_location='cpu'))) + + def save_model(self, path, rank=0): + if rank == 0: + torch.save(self.flownet.state_dict(), + '{}/flownet.pkl'.format(path)) + + def inference(self, img0, img1, scale=1.0): + imgs = torch.cat((img0, img1), 1) + scale_list = [4 / scale, 2 / scale, 1 / scale] + _, _, merged = self.flownet(imgs, scale_list) + return merged[2].detach() + + def forward(self, inputs): + img0 = inputs['img0'] + img1 = inputs['img1'] + scale = inputs['scale'] + return {'output': self.inference(img0, img1, scale)} + + def update(self, + imgs, + gt, + learning_rate=0, + mul=1, + training=True, + flow_gt=None): + for param_group in self.optimG.param_groups: + param_group['lr'] = learning_rate + # img0 = imgs[:, :3] + # img1 = imgs[:, 3:] + if training: + self.train() + else: + self.eval() + scale = [4, 2, 1] + flow, mask, merged = self.flownet( + torch.cat((imgs, gt), 1), scale=scale, training=training) + loss_l1 = (merged[2] - gt).abs().mean() + loss_smooth = self.sobel(flow[2], flow[2] * 0).mean() + # loss_vgg = self.vgg(merged[2], gt) + if training: + self.optimG.zero_grad() + loss_G = loss_cons + loss_smooth * 0.1 + loss_G.backward() + self.optimG.step() + # else: + # flow_teacher = flow[2] + return merged[2], { + 'mask': mask, + 'flow': flow[2][:, :2], + 'loss_l1': loss_l1, + 'loss_cons': loss_cons, + 'loss_smooth': loss_smooth, + } diff --git a/modelscope/models/cv/video_frame_interpolation/rife/__init__.py b/modelscope/models/cv/video_frame_interpolation/rife/__init__.py new file mode 100644 index 000000000..af475199c --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/__init__.py @@ -0,0 +1,5 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +from .RIFE_HDv3 import RIFEModel diff --git a/modelscope/models/cv/video_frame_interpolation/rife/loss.py b/modelscope/models/cv/video_frame_interpolation/rife/loss.py new file mode 100644 index 000000000..97f7644ca --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/loss.py @@ -0,0 +1,144 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class EPE(nn.Module): + + def __init__(self): + super(EPE, self).__init__() + 
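Since RIFEModel.inference above is the interpolation entry point, a short usage sketch may help (illustrative only; the model directory below is a hypothetical placeholder that must contain the flownet.pkl weights load_model expects, and the frames are [0, 1]-normalized NCHW tensors whose height and width are multiples of 32 so every coarse-to-fine scale divides evenly):

import torch

from modelscope.models.cv.video_frame_interpolation.rife import RIFEModel

model = RIFEModel(model_dir='path/to/rife_weights')  # hypothetical local path holding flownet.pkl
img0 = torch.rand(1, 3, 256, 448, device=model.device)
img1 = torch.rand(1, 3, 256, 448, device=model.device)

with torch.no_grad():
    mid = model.inference(img0, img1, scale=1.0)  # frame halfway between img0 and img1
print(mid.shape)  # torch.Size([1, 3, 256, 448])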
+ def forward(self, flow, gt, loss_mask): + loss_map = (flow - gt.detach())**2 + loss_map = (loss_map.sum(1, True) + 1e-6)**0.5 + return (loss_map * loss_mask) + + +class Ternary(nn.Module): + + def __init__(self): + super(Ternary, self).__init__() + patch_size = 7 + out_channels = patch_size * patch_size + self.w = np.eye(out_channels).reshape( + (patch_size, patch_size, 1, out_channels)) + self.w = np.transpose(self.w, (3, 2, 0, 1)) + self.w = torch.tensor(self.w).float().to(device) + + def transform(self, img): + patches = F.conv2d(img, self.w, padding=3, bias=None) + transf = patches - img + transf_norm = transf / torch.sqrt(0.81 + transf**2) + return transf_norm + + def rgb2gray(self, rgb): + r, g, b = rgb[:, 0:1, :, :], rgb[:, 1:2, :, :], rgb[:, 2:3, :, :] + gray = 0.2989 * r + 0.5870 * g + 0.1140 * b + return gray + + def hamming(self, t1, t2): + dist = (t1 - t2)**2 + dist_norm = torch.mean(dist / (0.1 + dist), 1, True) + return dist_norm + + def valid_mask(self, t, padding): + n, _, h, w = t.size() + inner = torch.ones(n, 1, h - 2 * padding, w - 2 * padding).type_as(t) + mask = F.pad(inner, [padding] * 4) + return mask + + def forward(self, img0, img1): + img0 = self.transform(self.rgb2gray(img0)) + img1 = self.transform(self.rgb2gray(img1)) + return self.hamming(img0, img1) * self.valid_mask(img0, 1) + + +class SOBEL(nn.Module): + + def __init__(self): + super(SOBEL, self).__init__() + self.kernelX = torch.tensor([ + [1, 0, -1], + [2, 0, -2], + [1, 0, -1], + ]).float() + self.kernelY = self.kernelX.clone().T + self.kernelX = self.kernelX.unsqueeze(0).unsqueeze(0).to(device) + self.kernelY = self.kernelY.unsqueeze(0).unsqueeze(0).to(device) + + def forward(self, pred, gt): + N, C, H, W = pred.shape[0], pred.shape[1], pred.shape[2], pred.shape[3] + img_stack = torch.cat( + [pred.reshape(N * C, 1, H, W), + gt.reshape(N * C, 1, H, W)], 0) + sobel_stack_x = F.conv2d(img_stack, self.kernelX, padding=1) + sobel_stack_y = F.conv2d(img_stack, self.kernelY, padding=1) + pred_X, gt_X = sobel_stack_x[:N * C], sobel_stack_x[N * C:] + pred_Y, gt_Y = sobel_stack_y[:N * C], sobel_stack_y[N * C:] + + L1X, L1Y = torch.abs(pred_X - gt_X), torch.abs(pred_Y - gt_Y) + loss = (L1X + L1Y) + return loss + + +class MeanShift(nn.Conv2d): + + def __init__(self, data_mean, data_std, data_range=1, norm=True): + c = len(data_mean) + super(MeanShift, self).__init__(c, c, kernel_size=1) + std = torch.Tensor(data_std) + self.weight.data = torch.eye(c).view(c, c, 1, 1) + if norm: + self.weight.data.div_(std.view(c, 1, 1, 1)) + self.bias.data = -1 * data_range * torch.Tensor(data_mean) + self.bias.data.div_(std) + else: + self.weight.data.mul_(std.view(c, 1, 1, 1)) + self.bias.data = data_range * torch.Tensor(data_mean) + self.requires_grad = False + + +class VGGPerceptualLoss(torch.nn.Module): + + def __init__(self, rank=0): + super(VGGPerceptualLoss, self).__init__() + # blocks = [] + pretrained = True + self.vgg_pretrained_features = models.vgg19( + pretrained=pretrained).features + self.normalize = MeanShift([0.485, 0.456, 0.406], + [0.229, 0.224, 0.225], + norm=True).cuda() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X, Y, indices=None): + X = self.normalize(X) + Y = self.normalize(Y) + indices = [2, 7, 12, 21, 30] + weights = [1.0 / 2.6, 1.0 / 4.8, 1.0 / 3.7, 1.0 / 5.6, 10 / 1.5] + k = 0 + loss = 0 + for i in range(indices[-1]): + X = self.vgg_pretrained_features[i](X) + Y = self.vgg_pretrained_features[i](Y) + if (i + 1) in indices: + loss += weights[k] * (X - 
Y.detach()).abs().mean() * 0.1 + k += 1 + return loss + + +if __name__ == '__main__': + img0 = torch.zeros(3, 3, 256, 256).float().to(device) + img1 = torch.tensor(np.random.normal(0, 1, + (3, 3, 256, 256))).float().to(device) + ternary_loss = Ternary() + print(ternary_loss(img0, img1).shape) diff --git a/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py b/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py new file mode 100644 index 000000000..e4440e6f3 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py @@ -0,0 +1,40 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import torch +import torch.nn as nn + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +backwarp_tenGrid = {} + + +def warp(tenInput, tenFlow): + k = (str(tenFlow.device), str(tenFlow.size())) + if k not in backwarp_tenGrid: + tenHorizontal = torch.linspace( + -1.0, 1.0, tenFlow.shape[3], device=device).view( + 1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, + tenFlow.shape[2], -1) + tenVertical = torch.linspace( + -1.0, 1.0, tenFlow.shape[2], + device=device).view(1, 1, tenFlow.shape[2], + 1).expand(tenFlow.shape[0], -1, -1, + tenFlow.shape[3]) + backwarp_tenGrid[k] = torch.cat([tenHorizontal, tenVertical], + 1).to(device) + + tenFlow = torch.cat( + [ + tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0), # no qa + tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0) + ], + 1) # no qa + + g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1) + return torch.nn.functional.grid_sample( + input=tenInput, + grid=g, + mode='bilinear', + padding_mode='border', + align_corners=True) diff --git a/modelscope/models/cv/video_stabilization/DUT/config.py b/modelscope/models/cv/video_stabilization/DUT/config.py index 85c33bc3c..dde1d11fe 100644 --- a/modelscope/models/cv/video_stabilization/DUT/config.py +++ b/modelscope/models/cv/video_stabilization/DUT/config.py @@ -64,7 +64,7 @@ # scale strength weight __C.TRAIN.scale_com_strength = 100.0 -# non maximum supression threshold +# non maximum suppression threshold __C.TRAIN.NMS_THRESH = 0.0 # nms kernel size diff --git a/modelscope/models/cv/vidt/backbone.py b/modelscope/models/cv/vidt/backbone.py index 198ab498d..bcfcff9fb 100644 --- a/modelscope/models/cv/vidt/backbone.py +++ b/modelscope/models/cv/vidt/backbone.py @@ -440,7 +440,7 @@ def forward(self, x, mask_matrix, pos, cross_attn, cross_attn_mask): det = det + det_pos shifted_x = (shifted_x, cross_patch) else: - # it cross_attn is deativated, only [PATCH] and [DET] self-attention are performed + # it cross_attn is deactivated, only [PATCH] and [DET] self-attention are performed det = det + det_pos shifted_x = shifted_x @@ -961,7 +961,7 @@ def finetune_det(self, block.det_token_num = det_token_num block.det_pos_linear = nn.Linear(pos_dim, block.dim) - # neck-free model do not require downsamling at the last stage. + # neck-free model do not require downsampling at the last stage. 
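# A minimal usage sketch for warp() from rife/warplayer.py above: backward-warp a frame
# (or feature map) by a dense optical-flow field via grid_sample. Shapes and values are
# illustrative; with zero flow the warped output matches the input.
import torch
from modelscope.models.cv.video_frame_interpolation.rife.warplayer import warp

dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # same rule as the module-level device
frame = torch.rand(1, 3, 64, 64, device=dev)      # source frame / feature map
flow = torch.zeros(1, 2, 64, 64, device=dev)      # per-pixel (dx, dy) displacements in pixels
warped = warp(frame, flow)                        # same shape as frame
assert torch.allclose(frame, warped, atol=1e-5)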
if method == 'vidt_wo_neck': self.layers[-1].downsample = None diff --git a/modelscope/models/cv/vidt/fpn_fusion.py b/modelscope/models/cv/vidt/fpn_fusion.py index b48ba0feb..f0531c828 100644 --- a/modelscope/models/cv/vidt/fpn_fusion.py +++ b/modelscope/models/cv/vidt/fpn_fusion.py @@ -30,7 +30,7 @@ def forward(self, x_blocks): x_blocks = x_blocks - # preperation: channel reduction and normalization + # preparation: channel reduction and normalization for idx in range(self.n_block - 1, -1, -1): x_blocks[idx] = getattr(self.multi_scaler, f'layer_{idx}_rn')( x_blocks[idx]) @@ -111,8 +111,8 @@ def __init__(self, features (int): channel dim of the input feature activation: activation function to use bn: whether to use bn - expand: whether to exapnd feature or not - align_corners: wheter to use align_corners for interpolation + expand: whether to expand feature or not + align_corners: whether to use align_corners for interpolation """ super(FeatureFusionBlock, self).__init__() diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py index 1ee715c91..36479d565 100644 --- a/modelscope/models/multi_modal/clip/bert_tokenizer.py +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -157,7 +157,7 @@ def whitespace_tokenize(text): class FullTokenizer(object): - """Runs end-to-end tokenziation.""" + """Runs end-to-end tokenization.""" def __init__(self, vocab_file, do_lower_case=True): self.vocab = load_vocab(vocab_file) @@ -185,7 +185,7 @@ def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): def clean_up_tokenization(out_string): """ Clean up a list of simple English tokenization artifacts - like spaces before punctuations and abreviated forms. + like spaces before punctuations and abbreviated forms. """ out_string = ( out_string.replace(' .', '.').replace(' ?', '?').replace( @@ -321,7 +321,7 @@ def _clean_text(self, text): class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" + """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): self.vocab = vocab @@ -384,7 +384,7 @@ def tokenize(self, text): def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them + # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == ' ' or char == '\t' or char == '\n' or char == '\r': return True diff --git a/modelscope/models/multi_modal/clip/configuration_bert.py b/modelscope/models/multi_modal/clip/configuration_bert.py index b75f5db89..b1a3966b2 100644 --- a/modelscope/models/multi_modal/clip/configuration_bert.py +++ b/modelscope/models/multi_modal/clip/configuration_bert.py @@ -37,7 +37,7 @@ class BertConfig(object): layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -46,7 +46,7 @@ class BertConfig(object): (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. 
- initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps: The epsilon used by LayerNorm. """ diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py index 11c5c8338..7491d40ed 100644 --- a/modelscope/models/multi_modal/clip/modeling_bert.py +++ b/modelscope/models/multi_modal/clip/modeling_bert.py @@ -485,7 +485,7 @@ def forward(self, head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( -1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters( - )).dtype) # switch to fload if need + fp16 compatibility + )).dtype) # switch to float if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers diff --git a/modelscope/models/multi_modal/clip_interrogator/model.py b/modelscope/models/multi_modal/clip_interrogator/model.py index a7e27cbd0..c04d7a9b2 100644 --- a/modelscope/models/multi_modal/clip_interrogator/model.py +++ b/modelscope/models/multi_modal/clip_interrogator/model.py @@ -1,4 +1,4 @@ -# This implementation is adopted from CLIP-Interrogator, made pubicly available under the MIT License at +# This implementation is adopted from CLIP-Interrogator, made publicly available under the MIT License at # https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py import hashlib diff --git a/modelscope/models/multi_modal/diffusion/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py index 0ca57fc4a..764cd0906 100644 --- a/modelscope/models/multi_modal/diffusion/structbert.py +++ b/modelscope/models/multi_modal/diffusion/structbert.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team and Alibaba inc. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -79,7 +79,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -88,7 +88,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ self.vocab_size = vocab_size diff --git a/modelscope/models/multi_modal/diffusion/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py index 918498cd8..ef57b63c7 100644 --- a/modelscope/models/multi_modal/diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/diffusion/tokenizer.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team and Alibaba inc. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py index aaa588d30..091aeca57 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. # The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os from dataclasses import dataclass diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py index 79ac2c33b..688378fc1 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. # The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os import os.path as osp from functools import partial diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py index 306ca2b0c..8abd9735d 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
# The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os from dataclasses import dataclass from typing import List, Tuple, Union diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py index c7ac3f947..d80c6f802 100644 --- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py @@ -1,5 +1,5 @@ # The implementation is adopted from Huaishao Luo, -# made pubicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip import cv2 import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 743c049ad..6a54f0a5d 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import os import random diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py index c2d96275d..48733de49 100644 --- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py +++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py index 535017203..479ebfb31 100644 --- a/modelscope/models/multi_modal/mmr/models/module_clip.py +++ b/modelscope/models/multi_modal/mmr/models/module_clip.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py index b958d5bca..f4327f8ca 100644 --- a/modelscope/models/multi_modal/mmr/models/module_cross.py +++ b/modelscope/models/multi_modal/mmr/models/module_cross.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip from __future__ import absolute_import, division, print_function import logging diff --git 
a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py index 97ee7156a..5dc5ff6d9 100644 --- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import gzip import html diff --git a/modelscope/models/multi_modal/mmr/models/until_module.py b/modelscope/models/multi_modal/mmr/models/until_module.py index 24e886b0f..fcc94dfe5 100644 --- a/modelscope/models/multi_modal/mmr/models/until_module.py +++ b/modelscope/models/multi_modal/mmr/models/until_module.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py index 6375d1d7e..b6165e655 100755 --- a/modelscope/models/multi_modal/mplug/predictor.py +++ b/modelscope/models/multi_modal/mplug/predictor.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py index 76ab11708..e6e7d9ac9 100644 --- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py +++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py @@ -6,10 +6,10 @@ import json import numpy as np +import packaging import torch import torch.cuda from PIL import Image -from pkg_resources import packaging from taming.models.vqgan import GumbelVQ, VQModel from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) diff --git a/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py index c7ac3f947..d80c6f802 100644 --- a/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py @@ -1,5 +1,5 @@ # The implementation is adopted from Huaishao Luo, -# made pubicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip import cv2 import numpy as np diff --git a/modelscope/models/multi_modal/prost/models/module_clip.py b/modelscope/models/multi_modal/prost/models/module_clip.py index c5aaa1e52..b340822ce 100644 --- a/modelscope/models/multi_modal/prost/models/module_clip.py +++ b/modelscope/models/multi_modal/prost/models/module_clip.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at 
https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/prost/models/module_cross.py b/modelscope/models/multi_modal/prost/models/module_cross.py index fae8e904b..ccfd50e6a 100644 --- a/modelscope/models/multi_modal/prost/models/module_cross.py +++ b/modelscope/models/multi_modal/prost/models/module_cross.py @@ -51,7 +51,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -60,7 +60,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `CrossModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): diff --git a/modelscope/models/multi_modal/prost/models/prost_model.py b/modelscope/models/multi_modal/prost/models/prost_model.py index 022903cb7..f3b5947bb 100644 --- a/modelscope/models/multi_modal/prost/models/prost_model.py +++ b/modelscope/models/multi_modal/prost/models/prost_model.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import os import random diff --git a/modelscope/models/multi_modal/prost/models/tokenization_clip.py b/modelscope/models/multi_modal/prost/models/tokenization_clip.py index 97ee7156a..5dc5ff6d9 100644 --- a/modelscope/models/multi_modal/prost/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/prost/models/tokenization_clip.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import gzip import html diff --git a/modelscope/models/multi_modal/prost/models/until_config.py b/modelscope/models/multi_modal/prost/models/until_config.py index dc9753d3e..8dc56375a 100755 --- a/modelscope/models/multi_modal/prost/models/until_config.py +++ b/modelscope/models/multi_modal/prost/models/until_config.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/prost/models/until_module.py b/modelscope/models/multi_modal/prost/models/until_module.py index 20afc2c3b..c072445ad 100644 --- a/modelscope/models/multi_modal/prost/models/until_module.py +++ b/modelscope/models/multi_modal/prost/models/until_module.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/video_synthesis/autoencoder.py b/modelscope/models/multi_modal/video_synthesis/autoencoder.py index 7885f2626..34bcee1b0 100644 --- a/modelscope/models/multi_modal/video_synthesis/autoencoder.py +++ b/modelscope/models/multi_modal/video_synthesis/autoencoder.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from latent-diffusion, -# publicly avaialbe at https://github.com/CompVis/latent-diffusion. +# publicly available at https://github.com/CompVis/latent-diffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import numpy as np diff --git a/modelscope/models/multi_modal/video_synthesis/diffusion.py b/modelscope/models/multi_modal/video_synthesis/diffusion.py index 138fddae3..2c4d4f6d2 100644 --- a/modelscope/models/multi_modal/video_synthesis/diffusion.py +++ b/modelscope/models/multi_modal/video_synthesis/diffusion.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from latent-diffusion, -# publicly avaialbe at https://github.com/CompVis/latent-diffusion. +# publicly available at https://github.com/CompVis/latent-diffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import torch diff --git a/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py b/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py index 0ec66069f..76f30580d 100644 --- a/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py +++ b/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py @@ -58,7 +58,7 @@ def __init__(self, model_dir, *args, **kwargs): `True`. """ super().__init__(model_dir=model_dir, *args, **kwargs) - self.device = torch.device('cuda') if torch.cuda.is_available() \ + self.device = torch.device(kwargs.get('device', 'cuda')) if torch.cuda.is_available() \ else torch.device('cpu') self.config = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) diff --git a/modelscope/models/multi_modal/video_synthesis/unet_sd.py b/modelscope/models/multi_modal/video_synthesis/unet_sd.py index f3c764eb2..779320e28 100644 --- a/modelscope/models/multi_modal/video_synthesis/unet_sd.py +++ b/modelscope/models/multi_modal/video_synthesis/unet_sd.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from stable-diffusion, -# publicly avaialbe at https://github.com/Stability-AI/stablediffusion. +# publicly available at https://github.com/Stability-AI/stablediffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
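# A small sketch of the device-selection change in text_to_video_synthesis_model.py above:
# an explicit `device` kwarg is honoured when CUDA is available, otherwise the model falls
# back to CPU. The helper name below is illustrative and not part of the patch.
import torch

def resolve_device(**kwargs):
    return torch.device(kwargs.get('device', 'cuda')) if torch.cuda.is_available() \
        else torch.device('cpu')

print(resolve_device())                 # cuda on GPU machines, cpu otherwise
print(resolve_device(device='cuda:1'))  # cuda:1 when CUDA is available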
import math diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py index 97c3a7a9e..09b867c4c 100644 --- a/modelscope/models/nlp/bart/text_error_correction.py +++ b/modelscope/models/nlp/bart/text_error_correction.py @@ -82,5 +82,5 @@ def forward(self, input: Dict[str, Dict]) -> TextErrorCorrectionOutput: batch_preds = [] for i in range(batch_size): # get 1-best List[Tensor] - batch_preds.append(translations[i][0]['tokens']) + batch_preds.append(translations[i][0]['tokens'].tolist()) return TextErrorCorrectionOutput(predictions=batch_preds) diff --git a/modelscope/models/nlp/dgds/backbone.py b/modelscope/models/nlp/dgds/backbone.py index 17e3c5746..9acf3937f 100644 --- a/modelscope/models/nlp/dgds/backbone.py +++ b/modelscope/models/nlp/dgds/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/fid_plug/backbone.py b/modelscope/models/nlp/fid_plug/backbone.py index 5dcddcc15..f86f35fe6 100644 --- a/modelscope/models/nlp/fid_plug/backbone.py +++ b/modelscope/models/nlp/fid_plug/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/llama/__init__.py b/modelscope/models/nlp/llama/__init__.py index d5b6fd19e..9de2d294f 100644 --- a/modelscope/models/nlp/llama/__init__.py +++ b/modelscope/models/nlp/llama/__init__.py @@ -1,8 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING -from transformers.models.llama import (LlamaConfig, LlamaTokenizer, - LlamaTokenizerFast) +from transformers import LlamaTokenizer +from transformers.models.llama import LlamaConfig, LlamaTokenizerFast from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/nlp/llama/backbone.py b/modelscope/models/nlp/llama/backbone.py index 0ac5bf5cc..dd22da016 100755 --- a/modelscope/models/nlp/llama/backbone.py +++ b/modelscope/models/nlp/llama/backbone.py @@ -49,6 +49,7 @@ def _instantiate(cls, **kwargs): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ model_dir = kwargs.pop('model_dir', None) + device = kwargs.pop('device', None) if model_dir is None: config = LlamaConfig(**kwargs) model = cls(config) @@ -56,7 +57,8 @@ def _instantiate(cls, **kwargs): model = super(MsModelMixin, cls).from_pretrained( pretrained_model_name_or_path=model_dir, **kwargs) model.model_dir = model_dir - return model + return model if 'device_map' in kwargs \ + or device is None else model.to(device) class LlamaPreTrainedModel(MsModelMixin, LlamaPreTrainedModelHF, TorchModel): diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py index 079cfd46d..3f717298b 100644 --- a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -58,6 +58,7 @@ def setup_model(args): if args.load_pretrained is not None: args.no_load_optim = True args.load = args.load_pretrained + args.no_load_rng = True _ = load_checkpoint(model, None, None, args) return model diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py index 28b5cd1ea..8d989820e 100644 --- a/modelscope/models/nlp/mglm/model/modeling_bert.py +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # @@ -203,7 +203,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -212,7 +212,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): @@ -743,7 +743,7 @@ def forward(self, sequence_output, pooled_output): class PreTrainedBertModel(nn.Module): """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. + a simple interface for downloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): @@ -799,7 +799,7 @@ def from_pretrained(cls, . 
`bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ # noqa diff --git a/modelscope/models/nlp/mglm/model/transformer.py b/modelscope/models/nlp/mglm/model/transformer.py index da944c768..c807de87d 100644 --- a/modelscope/models/nlp/mglm/model/transformer.py +++ b/modelscope/models/nlp/mglm/model/transformer.py @@ -155,7 +155,7 @@ class ParallelSelfAttention(torch.nn.Module): """Parallel self-attention layer for GPT2. Self-attention layer takes input with size [b, s, h] where b is - the batch size, s is the sequence lenght, and h is the hidden size + the batch size, s is the sequence length, and h is the hidden size and creates output of the same size. Arguments: hidden_size: total hidden size of the layer (h). diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py index cd3ecdaf2..a21058fde 100644 --- a/modelscope/models/nlp/palm_v2/text_generation.py +++ b/modelscope/models/nlp/palm_v2/text_generation.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/plug/backbone.py b/modelscope/models/nlp/plug/backbone.py index 37714ed77..0442414cb 100644 --- a/modelscope/models/nlp/plug/backbone.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/plug_mental/backbone.py b/modelscope/models/nlp/plug_mental/backbone.py index e8531f529..918fcdbd9 100755 --- a/modelscope/models/nlp/plug_mental/backbone.py +++ b/modelscope/models/nlp/plug_mental/backbone.py @@ -1031,7 +1031,7 @@ def forward(self, head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, orignal_embeds = self.embeddings( + embedding_output, original_embeds = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, @@ -1065,7 +1065,7 @@ def forward(self, if not return_dict: return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + pooled_output) + encoder_outputs[1:] + (original_embeds, ) return AttentionBackboneModelOutputWithEmbedding( last_hidden_state=sequence_output, @@ -1074,4 +1074,4 @@ def forward(self, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) + embedding_output=original_embeds) diff --git a/modelscope/models/nlp/space_T_cn/backbone.py b/modelscope/models/nlp/space_T_cn/backbone.py index b1df58bad..42df1b12b 100644 --- a/modelscope/models/nlp/space_T_cn/backbone.py +++ b/modelscope/models/nlp/space_T_cn/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -656,7 +656,7 @@ def from_pretrained(cls, . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) diff --git a/modelscope/models/nlp/space_T_cn/configuration.py b/modelscope/models/nlp/space_T_cn/configuration.py index e698b310d..0d39c90ed 100644 --- a/modelscope/models/nlp/space_T_cn/configuration.py +++ b/modelscope/models/nlp/space_T_cn/configuration.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -52,7 +52,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -60,7 +60,7 @@ def __init__(self, ever be used with. 
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `SpaceTCnConfig`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py index 58d324a8d..d1998e984 100755 --- a/modelscope/models/nlp/structbert/backbone.py +++ b/modelscope/models/nlp/structbert/backbone.py @@ -881,7 +881,7 @@ def forward(self, head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, orignal_embeds = self.embeddings( + embedding_output, original_embeds = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, @@ -907,7 +907,7 @@ def forward(self, if not return_dict: return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + pooled_output) + encoder_outputs[1:] + (original_embeds, ) return AttentionBackboneModelOutputWithEmbedding( last_hidden_state=sequence_output, @@ -916,4 +916,4 @@ def forward(self, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) + embedding_output=original_embeds) diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py index bc22ab617..6c05bcff8 100644 --- a/modelscope/models/nlp/structbert/faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -375,6 +375,8 @@ def sentence_embedding(self, inputs: Dict[str, Tensor]): input_ids = torch.IntTensor(input_ids) if not isinstance(input_mask, Tensor): input_mask = torch.IntTensor(input_mask) + input_ids = input_ids.to(self.bert.device) + input_mask = input_mask.to(self.bert.device) rst = self.bert(input_ids, input_mask) last_hidden_states = rst.last_hidden_state if len(input_mask.shape) == 2: diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py index 70200e446..534a05008 100644 --- a/modelscope/msdatasets/__init__.py +++ b/modelscope/msdatasets/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from .ms_dataset import MsDataset +from modelscope.msdatasets.ms_dataset import MsDataset diff --git a/modelscope/msdatasets/context/dataset_context_config.py b/modelscope/msdatasets/context/dataset_context_config.py index 48124d786..fce262b02 100644 --- a/modelscope/msdatasets/context/dataset_context_config.py +++ b/modelscope/msdatasets/context/dataset_context_config.py @@ -17,7 +17,8 @@ def __init__(self, dataset_name: Union[str, list], namespace: str, data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], download_mode: DownloadMode, cache_root_dir: str, - use_streaming: bool, stream_batch_size: int, **kwargs): + use_streaming: bool, stream_batch_size: int, + trust_remote_code: bool, **kwargs): self._download_config = None self._data_meta_config = None @@ -44,6 +45,7 @@ def __init__(self, dataset_name: Union[str, list], namespace: str, self.use_streaming = use_streaming self.stream_batch_size = stream_batch_size self.download_virgo_files: bool = False + self.trust_remote_code: bool = trust_remote_code @property def config_kwargs(self) -> dict: diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index f29acc8fc..920744499 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -127,6 +127,7 @@ def _prepare_and_download(self) -> None: cache_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode input_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code if self.builder is None and not dataset_py_script: raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' @@ -141,7 +142,7 @@ def _prepare_and_download(self) -> None: data_files=data_files, cache_dir=cache_dir, download_mode=download_mode.value, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_kwargs) else: self.dataset = self.data_files_manager.fetch_data_files( diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py index 0dec5d89c..a9e58b7c4 100644 --- a/modelscope/msdatasets/data_loader/data_loader_manager.py +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -57,6 +57,7 @@ def load_dataset(self, data_loader_type: enum.Enum): cache_root_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode use_streaming = self.dataset_context_config.use_streaming + trust_remote_code = self.dataset_context_config.trust_remote_code input_config_kwargs = self.dataset_context_config.config_kwargs # load local single file @@ -81,7 +82,7 @@ def load_dataset(self, data_loader_type: enum.Enum): cache_dir=cache_root_dir, download_mode=download_mode.value, streaming=use_streaming, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) raise f'Expected local data loader type: {LocalDataLoaderType.HF_DATA_LOADER.value}.' 
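# A minimal sketch of the loader-argument change above: the data loader managers now
# forward `trust_remote_code` (plumbed through DatasetContextConfig) to
# datasets.load_dataset instead of the older `ignore_verifications=True` flag.
# Assumes a `datasets` release that accepts `trust_remote_code`; the file name is illustrative.
from datasets import load_dataset

ds = load_dataset(
    'csv',                                    # packaged builder, no remote code involved
    data_files={'train': 'train.csv'},        # hypothetical local file
    download_mode='reuse_dataset_if_exists',
    streaming=False,
    trust_remote_code=True,
)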
@@ -105,6 +106,7 @@ def load_dataset(self, data_loader_type: enum.Enum): download_mode_val = self.dataset_context_config.download_mode.value use_streaming = self.dataset_context_config.use_streaming input_config_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code # To use the huggingface data loader if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: @@ -117,7 +119,7 @@ def load_dataset(self, data_loader_type: enum.Enum): data_files=data_files, download_mode=download_mode_val, streaming=use_streaming, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) # download statistics self.api.dataset_download_statistics( diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py index f9ffd9a72..9c1c75841 100644 --- a/modelscope/msdatasets/dataset_cls/dataset.py +++ b/modelscope/msdatasets/dataset_cls/dataset.py @@ -149,6 +149,7 @@ def _download_item(self, item): if isinstance(ex_cache_path, str): ex_cache_path = [ex_cache_path] ret[k] = ex_cache_path + ret[k.strip(':FILE')] = v except Exception as e: logger.error(e) diff --git a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py index 0c5c41543..845636682 100644 --- a/modelscope/msdatasets/download/dataset_builder.py +++ b/modelscope/msdatasets/download/dataset_builder.py @@ -330,6 +330,7 @@ def __init__(self, dataset_context_config: DatasetContextConfig): super().__init__( cache_dir=self.cache_build_dir, + dataset_name=self.dataset_name, config_name=self.namespace, hash=sub_dir_hash, data_files=None, # TODO: self.meta_data_files, diff --git a/modelscope/msdatasets/download/download_config.py b/modelscope/msdatasets/download/download_config.py index 11118f85c..0fc95cd9a 100644 --- a/modelscope/msdatasets/download/download_config.py +++ b/modelscope/msdatasets/download/download_config.py @@ -6,16 +6,18 @@ class DataDownloadConfig(DownloadConfig): + """ + Extends `DownloadConfig` with additional attributes for data download. 
+ """ - def __init__(self): - self.dataset_name: Optional[str] = None - self.namespace: Optional[str] = None - self.version: Optional[str] = None - self.split: Optional[Union[str, list]] = None - self.data_dir: Optional[str] = None - self.oss_config: Optional[dict] = {} - self.meta_args_map: Optional[dict] = {} - self.num_proc: int = 4 + dataset_name: Optional[str] = None + namespace: Optional[str] = None + version: Optional[str] = None + split: Optional[Union[str, list]] = None + data_dir: Optional[str] = None + oss_config: Optional[dict] = {} + meta_args_map: Optional[dict] = {} + num_proc: int = 4 def copy(self) -> 'DataDownloadConfig': return self diff --git a/modelscope/msdatasets/download/download_manager.py b/modelscope/msdatasets/download/download_manager.py index 4799171aa..5e36cdce6 100644 --- a/modelscope/msdatasets/download/download_manager.py +++ b/modelscope/msdatasets/download/download_manager.py @@ -36,6 +36,11 @@ def _download(self, url_or_filename: str, return cached_path( url_or_filename, download_config=download_config) + def _download_single(self, url_or_filename: str, + download_config: DataDownloadConfig) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename, download_config) + class DataStreamingDownloadManager(StreamingDownloadManager): """The data streaming download manager.""" @@ -62,3 +67,7 @@ def _download(self, url_or_filename: str) -> str: else: return cached_path( url_or_filename, download_config=self.download_config) + + def _download_single(self, url_or_filename: str) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename) diff --git a/modelscope/msdatasets/meta/data_meta_manager.py b/modelscope/msdatasets/meta/data_meta_manager.py index 3f1e65726..e5a57f026 100644 --- a/modelscope/msdatasets/meta/data_meta_manager.py +++ b/modelscope/msdatasets/meta/data_meta_manager.py @@ -92,6 +92,10 @@ def fetch_meta_files(self) -> None: data_meta_config.meta_cache_dir = meta_cache_dir data_meta_config.dataset_scripts = dataset_scripts data_meta_config.dataset_formation = dataset_formation + if '.py' in dataset_scripts: + tmp_py_scripts = dataset_scripts['.py'] + if len(tmp_py_scripts) > 0: + data_meta_config.dataset_py_script = tmp_py_scripts[0] # Set dataset_context_config self.dataset_context_config.data_meta_config = data_meta_config @@ -112,7 +116,9 @@ def parse_dataset_structure(self): dataset_py_script = None dataset_scripts = data_meta_config.dataset_scripts if not dataset_scripts or len(dataset_scripts) == 0: - raise 'Cannot find dataset meta-files, please fetch meta from modelscope hub.' + raise FileNotFoundError( + 'Cannot find dataset meta-files, please fetch meta from modelscope hub.' + ) if '.py' in dataset_scripts: dataset_py_script = dataset_scripts['.py'][0] for json_path in dataset_scripts['.json']: @@ -121,7 +127,9 @@ def parse_dataset_structure(self): dataset_json = json.load(dataset_json_file) break if not dataset_json and not dataset_py_script: - raise f'File {dataset_name}.json and {dataset_name}.py not found, please specify at least one meta-file.' 
+ raise FileNotFoundError( + f'File {dataset_name}.json and {dataset_name}.py not found,' + 'please specify at least one meta-file.') # Parse meta and get dataset structure if dataset_py_script: diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index b720ada62..899142adc 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -13,7 +13,6 @@ from modelscope.hub.repository import DatasetRepository from modelscope.msdatasets.context.dataset_context_config import \ DatasetContextConfig -from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader from modelscope.msdatasets.data_loader.data_loader_manager import ( LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager, RemoteDataLoaderType) @@ -22,14 +21,15 @@ from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ build_custom_dataset from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager +from modelscope.msdatasets.utils.hf_datasets_util import load_dataset_with_ctx from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.preprocessors import build_preprocessor from modelscope.utils.config import Config, ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, ConfigFields, - DownloadMode, Hubs, ModeKeys, Tasks, - UploadMode, VirgoDatasetConfig) + DatasetFormations, DownloadMode, Hubs, + ModeKeys, Tasks, UploadMode) from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger @@ -167,6 +167,8 @@ def load( stream_batch_size: Optional[int] = 1, custom_cfg: Optional[Config] = Config(), token: Optional[str] = None, + dataset_info_only: Optional[bool] = False, + trust_remote_code: Optional[bool] = True, **config_kwargs, ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. @@ -196,6 +198,8 @@ def load( custom_cfg (str, Optional): Model configuration, this can be used for custom datasets. see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3 token (str, Optional): SDK token of ModelScope. + dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict). + trust_remote_code (bool, Optional): If set to True, trust the remote code. 
**config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: @@ -248,6 +252,7 @@ def load( cache_root_dir=cache_dir, use_streaming=use_streaming, stream_batch_size=stream_batch_size, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from local disk @@ -266,32 +271,66 @@ def load( return dataset_inst # Load from the huggingface hub elif hub == Hubs.huggingface: - dataset_inst = RemoteDataLoaderManager( - dataset_context_config).load_dataset( - RemoteDataLoaderType.HF_DATA_LOADER) - dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) - if isinstance(dataset_inst, MsDataset): - dataset_inst._dataset_context_config = dataset_context_config - if custom_cfg: - dataset_inst.to_custom_dataset( - custom_cfg=custom_cfg, **config_kwargs) - dataset_inst.is_custom = True - return dataset_inst + from datasets import load_dataset + return load_dataset( + dataset_name, + name=subset_name, + split=split, + streaming=use_streaming, + download_mode=download_mode.value, + trust_remote_code=trust_remote_code, + **config_kwargs) + # Load from the modelscope hub elif hub == Hubs.modelscope: - remote_dataloader_manager = RemoteDataLoaderManager( - dataset_context_config) - dataset_inst = remote_dataloader_manager.load_dataset( - RemoteDataLoaderType.MS_DATA_LOADER) - dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) - if isinstance(dataset_inst, MsDataset): - dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config - if custom_cfg: - dataset_inst.to_custom_dataset( - custom_cfg=custom_cfg, **config_kwargs) - dataset_inst.is_custom = True - return dataset_inst + + # Get dataset type from ModelScope Hub; dataset_type->4: General Dataset + from modelscope.hub.api import HubApi + _api = HubApi() + dataset_id_on_hub, dataset_type = _api.get_dataset_id_and_type( + dataset_name=dataset_name, namespace=namespace) + + # Load from the ModelScope Hub for type=4 (general) + if str(dataset_type) == str(DatasetFormations.general.value): + + with load_dataset_with_ctx( + path=namespace + '/' + dataset_name, + name=subset_name, + data_dir=data_dir, + data_files=data_files, + split=split, + cache_dir=cache_dir, + features=None, + download_config=None, + download_mode=download_mode.value, + revision=version, + token=token, + streaming=use_streaming, + dataset_info_only=dataset_info_only, + trust_remote_code=trust_remote_code, + **config_kwargs) as dataset_res: + + return dataset_res + + else: + + remote_dataloader_manager = RemoteDataLoaderManager( + dataset_context_config) + dataset_inst = remote_dataloader_manager.load_dataset( + RemoteDataLoaderType.MS_DATA_LOADER) + dataset_inst = MsDataset.to_ms_dataset( + dataset_inst, target=target) + if isinstance(dataset_inst, MsDataset): + dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config + if custom_cfg: + dataset_inst.to_custom_dataset( + custom_cfg=custom_cfg, **config_kwargs) + dataset_inst.is_custom = True + return dataset_inst + elif hub == Hubs.virgo: + from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader + from modelscope.utils.constant import VirgoDatasetConfig # Rewrite the namespace, version and cache_dir for virgo dataset. 
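# A minimal usage sketch for the reworked MsDataset.load() path above, assuming a public
# dataset hosted on the ModelScope hub; the dataset, subset and split names are illustrative.
from modelscope.msdatasets import MsDataset

ds = MsDataset.load(
    'clue',                        # dataset name on the ModelScope hub (illustrative)
    subset_name='afqmc',           # illustrative subset
    split='train',
    trust_remote_code=True,        # new flag, forwarded to the underlying loader
)
print(next(iter(ds)))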
if namespace == DEFAULT_DATASET_NAMESPACE: dataset_context_config.namespace = VirgoDatasetConfig.default_virgo_namespace @@ -323,6 +362,10 @@ def upload( chunksize: Optional[int] = 1, filter_hidden_files: Optional[bool] = True, upload_mode: Optional[UploadMode] = UploadMode.OVERWRITE) -> None: + r""" + @deprecated + This method is deprecated and may be removed in future releases, please use git command line instead. + """ """Upload dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first. Args: @@ -346,6 +389,10 @@ def upload( None """ + warnings.warn( + 'upload is deprecated, please use git command line to upload the dataset.', + DeprecationWarning) + if not object_name: raise ValueError('object_name cannot be empty!') @@ -393,6 +440,10 @@ def clone_meta(dataset_work_dir: str, None """ + warnings.warn( + 'upload is deprecated, please use git command line to upload the dataset.', + DeprecationWarning) + _repo = DatasetRepository( repo_work_dir=dataset_work_dir, dataset_id=dataset_id, diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index b40915eb8..960693c17 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -195,7 +195,7 @@ def get_dataset_files(subset_split_into: dict, for split, info in subset_split_into.items(): custom_type_map[split] = info.get('custom', '') - meta_map[split] = modelscope_api.get_dataset_file_url( + meta_map[split] = modelscope_api.get_dataset_file_url_origin( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] @@ -212,7 +212,10 @@ def get_dataset_files(subset_split_into: dict, csv_delimiter = context_config.config_kwargs.get('delimiter', ',') csv_df = pd.read_csv( - meta_csv_file_path, iterator=False, delimiter=csv_delimiter) + meta_csv_file_path, + iterator=False, + delimiter=csv_delimiter, + escapechar='\\') target_col = csv_df.columns[csv_df.columns.str.contains( ':FILE')].to_list() if len(target_col) == 0: diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py new file mode 100644 index 000000000..8bd768dc1 --- /dev/null +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -0,0 +1,1381 @@ +# noqa: isort:skip_file, yapf: disable +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 
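# A small sketch of the csv meta parsing change in dataset_utils.py above: passing
# escapechar='\\' lets a ':FILE' column contain delimiter characters escaped with a
# backslash. The in-memory csv below is illustrative only.
import io
import pandas as pd

meta_csv = 'image:FILE,label\nimgs/cat\\,01.jpg,cat\n'
df = pd.read_csv(io.StringIO(meta_csv), iterator=False, delimiter=',', escapechar='\\')
print(df.columns[df.columns.str.contains(':FILE')].to_list())   # ['image:FILE']
print(df['image:FILE'][0])                                      # imgs/cat,01.jpg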
+import importlib +import contextlib +import os +import warnings +from functools import partial +from pathlib import Path +from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple, Literal + +from urllib.parse import urlencode + +import requests +from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict, + DownloadConfig, DownloadManager, DownloadMode, Features, + IterableDataset, IterableDatasetDict, Split, + VerificationMode, Version, config, data_files) +from datasets.data_files import ( + FILES_TO_IGNORE, DataFilesDict, DataFilesList, EmptyDatasetError, + _get_data_files_patterns, _is_inside_unrequested_special_dir, + _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, get_metadata_patterns, sanitize_patterns) +from datasets.download.streaming_download_manager import ( + _prepare_path_and_storage_options, xbasename, xjoin) +from datasets.exceptions import DataFilesNotFoundError, DatasetNotFoundError +from datasets.info import DatasetInfosDict +from datasets.load import ( + ALL_ALLOWED_EXTENSIONS, BuilderConfigsParameters, + CachedDatasetModuleFactory, DatasetModule, + HubDatasetModuleFactoryWithoutScript, + HubDatasetModuleFactoryWithParquetExport, + HubDatasetModuleFactoryWithScript, LocalDatasetModuleFactoryWithoutScript, + LocalDatasetModuleFactoryWithScript, PackagedDatasetModuleFactory, + create_builder_configs_from_metadata_configs, get_dataset_builder_class, + import_main_class, infer_module_for_data_files, files_to_hash, + _get_importable_file_path, resolve_trust_remote_code, _create_importable_file, _load_importable_file, + init_dynamic_modules) +from datasets.naming import camelcase_to_snakecase +from datasets.packaged_modules import (_EXTENSION_TO_MODULE, + _MODULE_SUPPORTS_METADATA, + _MODULE_TO_EXTENSIONS, + _PACKAGED_DATASETS_MODULES) +from datasets.utils import file_utils +from datasets.utils.file_utils import (OfflineModeIsEnabled, + _raise_if_offline_mode_is_enabled, + cached_path, is_local_path, + is_relative_path, + relative_to_absolute_path) +from datasets.utils.info_utils import is_small_dataset +from datasets.utils.metadata import MetadataConfigs +from datasets.utils.py_utils import get_imports, map_nested +from datasets.utils.track import tracked_str +from fsspec import filesystem +from fsspec.core import _un_chain +from fsspec.utils import stringify_path +from huggingface_hub import (DatasetCard, DatasetCardData) +from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo +from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder +from packaging import version + +from modelscope import HubApi +from modelscope.hub.utils.utils import get_endpoint +from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms +from modelscope.utils.config_ds import MS_DATASETS_CACHE +from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +ExpandDatasetProperty_T = Literal[ + 'author', + 'cardData', + 'citation', + 'createdAt', + 'disabled', + 'description', + 'downloads', + 'downloadsAllTime', + 'gated', + 'lastModified', + 'likes', + 'paperswithcode_id', + 'private', + 'siblings', + 'sha', + 'tags', +] + + +def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) -> str: + url_or_filename = str(url_or_filename) + # for temp val + revision = None + if url_or_filename.startswith('hf://'): + revision, url_or_filename = url_or_filename.split('@', 1)[-1].split('/', 1) + if is_relative_path(url_or_filename): 
+ # append the relative path to the base_path + # url_or_filename = url_or_path_join(self._base_path, url_or_filename) + revision = revision or 'master' + # Note: make sure the FilePath is the last param + params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename} + params: str = urlencode(params) + url_or_filename = self._base_path + params + + out = cached_path(url_or_filename, download_config=download_config) + out = tracked_str(out) + out.set_origin(url_or_filename) + return out + + +def _dataset_info( + self, + repo_id: str, + *, + revision: Optional[str] = None, + timeout: Optional[float] = None, + files_metadata: bool = False, + token: Optional[Union[bool, str]] = None, + expand: Optional[List[ExpandDatasetProperty_T]] = None, +) -> HfDatasetInfo: + """ + Get info on one specific dataset on huggingface.co. + + Dataset can be private if you pass an acceptable token. + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + revision (`str`, *optional*): + The revision of the dataset repository from which to get the + information. + timeout (`float`, *optional*): + Whether to set a timeout for the request to the Hub. + files_metadata (`bool`, *optional*): + Whether or not to retrieve metadata for files in the repository + (size, LFS metadata, etc). Defaults to `False`. + token (`bool` or `str`, *optional*): + A valid authentication token (see https://huggingface.co/settings/token). + If `None` or `True` and machine is logged in (through `huggingface-cli login` + or [`~huggingface_hub.login`]), token will be retrieved from the cache. + If `False`, token is not sent in the request header. + + Returns: + [`hf_api.DatasetInfo`]: The dataset repository information. + + + + Raises the following errors: + + - [`~utils.RepositoryNotFoundError`] + If the repository to download from cannot be found. This may be because it doesn't exist, + or because it is set to `private` and you do not have access. + - [`~utils.RevisionNotFoundError`] + If the revision to download from cannot be found. + + + """ + _api = HubApi() + _namespace, _dataset_name = repo_id.split('/') + dataset_hub_id, dataset_type = _api.get_dataset_id_and_type( + dataset_name=_dataset_name, namespace=_namespace) + + revision: str = revision or 'master' + data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id, + revision=revision, + files_metadata=files_metadata, + timeout=timeout) + + # Parse data + data_d: dict = data['Data'] + data_file_list: list = data_d['Files'] + # commit_info: dict = data_d['LatestCommitter'] + + # Update data # TODO: columns align with HfDatasetInfo + data['id'] = repo_id + data['private'] = False + data['author'] = repo_id.split('/')[0] if repo_id else None + data['sha'] = revision + data['lastModified'] = None + data['gated'] = False + data['disabled'] = False + data['downloads'] = 0 + data['likes'] = 0 + data['tags'] = [] + data['cardData'] = [] + data['createdAt'] = None + + # e.g. 
{'rfilename': 'xxx', 'blobId': 'xxx', 'size': 0, 'lfs': {'size': 0, 'sha256': 'xxx', 'pointerSize': 0}} + data['siblings'] = [] + for file_info_d in data_file_list: + file_info = { + 'rfilename': file_info_d['Path'], + 'blobId': file_info_d['Id'], + 'size': file_info_d['Size'], + 'type': 'directory' if file_info_d['Type'] == 'tree' else 'file', + 'lfs': { + 'size': file_info_d['Size'], + 'sha256': file_info_d['Sha256'], + 'pointerSize': 0 + } + } + data['siblings'].append(file_info) + + return HfDatasetInfo(**data) + + +def _list_repo_tree( + self, + repo_id: str, + path_in_repo: Optional[str] = None, + *, + recursive: bool = True, + expand: bool = False, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + token: Optional[Union[bool, str]] = None, +) -> Iterable[Union[RepoFile, RepoFolder]]: + + _api = HubApi(timeout=3 * 60, max_retries=3) + + if is_relative_path(repo_id) and repo_id.count('/') == 1: + _namespace, _dataset_name = repo_id.split('/') + elif is_relative_path(repo_id) and repo_id.count('/') == 0: + logger.warning(f'Got a relative path: {repo_id} without namespace, ' + f'Use default namespace: {DEFAULT_DATASET_NAMESPACE}') + _namespace, _dataset_name = DEFAULT_DATASET_NAMESPACE, repo_id + else: + raise ValueError(f'Invalid repo_id: {repo_id} !') + + page_number = 1 + page_size = 100 + while True: + data: dict = _api.list_repo_tree(dataset_name=_dataset_name, + namespace=_namespace, + revision=revision or 'master', + root_path=path_in_repo or None, + recursive=True, + page_number=page_number, + page_size=page_size, + ) + if not ('Code' in data and data['Code'] == 200): + logger.error(f'Get dataset: {repo_id} file list failed, message: {data["Message"]}') + return None + + # Parse data (Type: 'tree' or 'blob') + data_file_list: list = data['Data']['Files'] + + for file_info_d in data_file_list: + path_info = {} + path_info['type'] = 'directory' if file_info_d['Type'] == 'tree' else 'file' + path_info['path'] = file_info_d['Path'] + path_info['size'] = file_info_d['Size'] + path_info['oid'] = file_info_d['Sha256'] + + yield RepoFile(**path_info) if path_info['type'] == 'file' else RepoFolder(**path_info) + + if len(data_file_list) < page_size: + break + page_number += 1 + + +def _get_paths_info( + self, + repo_id: str, + paths: Union[List[str], str], + *, + expand: bool = False, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + token: Optional[Union[bool, str]] = None, +) -> List[Union[RepoFile, RepoFolder]]: + + _api = HubApi() + _namespace, _dataset_name = repo_id.split('/') + dataset_hub_id, dataset_type = _api.get_dataset_id_and_type( + dataset_name=_dataset_name, namespace=_namespace) + + revision: str = revision or 'master' + data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id, + revision=revision, + files_metadata=False, + recursive='False') + data_d: dict = data['Data'] + data_file_list: list = data_d['Files'] + + return [ + RepoFile(path=item_d['Name'], + size=item_d['Size'], + oid=item_d['Revision'], + lfs=None, # TODO: lfs type to be supported + last_commit=None, # TODO: lfs type to be supported + security=None + ) for item_d in data_file_list if item_d['Name'] == 'README.md' + ] + + +def get_fs_token_paths( + urlpath, + storage_options=None, + protocol=None, +): + if isinstance(urlpath, (list, tuple, set)): + if not urlpath: + raise ValueError('empty urlpath sequence') + urlpath0 = stringify_path(list(urlpath)[0]) + else: + urlpath0 = stringify_path(urlpath) + storage_options = storage_options or {} + if protocol: + 
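# A minimal, self-contained sketch (hypothetical helper name) of the paging
# loop used by _list_repo_tree above: request fixed-size pages and stop as
# soon as a page comes back shorter than page_size.
def _iter_pages(fetch_page, page_size=100):
    page_number = 1
    while True:
        items = fetch_page(page_number, page_size)
        yield from items
        if len(items) < page_size:
            break
        page_number += 1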
storage_options['protocol'] = protocol + chain = _un_chain(urlpath0, storage_options or {}) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, nested_protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs['target_options'] = dict(**kw, **inkwargs) + inkwargs['target_protocol'] = nested_protocol + inkwargs['fo'] = urls + paths, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + + return fs + + +def _resolve_pattern( + pattern: str, + base_path: str, + allowed_extensions: Optional[List[str]] = None, + download_config: Optional[DownloadConfig] = None, +) -> List[str]: + """ + Resolve the paths and URLs of the data files from the pattern passed by the user. + + You can use patterns to resolve multiple local files. Here are a few examples: + - *.csv to match all the CSV files at the first level + - **.csv to match all the CSV files at any level + - data/* to match all the files inside "data" + - data/** to match all the files inside "data" and its subdirectories + + The patterns are resolved using the fsspec glob. + + glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /. + For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix, # noqa: E501 + resulting in **.json being equivalent to **/*.json. + + More generally: + - '*' matches any character except a forward-slash (to match just the file or directory name) + - '**' matches any character including a forward-slash / + + Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested. + The same applies to special directories that start with a double underscore like "__pycache__". + You can still include one if the pattern explicilty mentions it: + - to include a hidden file: "*/.hidden.txt" or "*/.*" + - to include a hidden directory: ".hidden/*" or ".*/*" + - to include a special directory: "__special__/*" or "__*/*" + + Example:: + + >>> from datasets.data_files import resolve_pattern + >>> base_path = "." + >>> resolve_pattern("docs/**/*.py", base_path) + [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py'] + + Args: + pattern (str): Unix pattern or paths or URLs of the data files to resolve. + The paths can be absolute or relative to base_path. + Remote filesystems using fsspec are supported, e.g. with the hf:// protocol. + base_path (str): Base path to use when resolving relative paths. + allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions). + For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"] + Returns: + List[str]: List of paths or URLs to the local or remote files that match the patterns. 
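    A short, hedged illustration of the glob semantics above, using fsspec's
    local filesystem (the directory layout is hypothetical):

        >>> import fsspec
        >>> fs = fsspec.filesystem('file')
        >>> fs.glob('data/*.csv')    # '*' does not cross '/', so top-level CSVs only
        >>> fs.glob('data/**.csv')   # '**' crosses '/', so CSVs at any depth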
+ """ + if is_relative_path(pattern): + pattern = xjoin(base_path, pattern) + elif is_local_path(pattern): + base_path = os.path.splitdrive(pattern)[0] + os.sep + else: + base_path = '' + # storage_options: {'hf': {'token': None, 'endpoint': 'https://huggingface.co'}} + pattern, storage_options = _prepare_path_and_storage_options( + pattern, download_config=download_config) + fs = get_fs_token_paths(pattern, storage_options=storage_options) + fs_base_path = base_path.split('::')[0].split('://')[-1] or fs.root_marker + fs_pattern = pattern.split('::')[0].split('://')[-1] + files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} + protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0] + protocol_prefix = protocol + '://' if protocol != 'file' else '' + glob_kwargs = {} + if protocol == 'hf' and config.HF_HUB_VERSION >= version.parse('0.20.0'): + # 10 times faster glob with detail=True (ignores costly info like lastCommit) + glob_kwargs['expand_info'] = False + + try: + tmp_file_paths = fs.glob(pattern, detail=True, **glob_kwargs) + except FileNotFoundError: + raise DataFilesNotFoundError(f"Unable to find '{pattern}'") + + matched_paths = [ + filepath if filepath.startswith(protocol_prefix) else protocol_prefix + + filepath for filepath, info in tmp_file_paths.items() + if info['type'] == 'file' and ( + xbasename(filepath) not in files_to_ignore) + and not _is_inside_unrequested_special_dir( + os.path.relpath(filepath, fs_base_path), + os.path.relpath(fs_pattern, fs_base_path)) and # noqa: W504 + not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir( # noqa: W504 + os.path.relpath(filepath, fs_base_path), + os.path.relpath(fs_pattern, fs_base_path)) + ] # ignore .ipynb and __pycache__, but keep /../ + if allowed_extensions is not None: + out = [ + filepath for filepath in matched_paths + if any('.' + suffix in allowed_extensions + for suffix in xbasename(filepath).split('.')[1:]) + ] + if len(out) < len(matched_paths): + invalid_matched_files = list(set(matched_paths) - set(out)) + logger.info( + f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: " + f'{invalid_matched_files}') + else: + out = matched_paths + if not out: + error_msg = f"Unable to find '{pattern}'" + if allowed_extensions is not None: + error_msg += f' with any supported extension {list(allowed_extensions)}' + raise FileNotFoundError(error_msg) + return out + + +def _get_data_patterns( + base_path: str, + download_config: Optional[DownloadConfig] = None) -> Dict[str, + List[str]]: + """ + Get the default pattern from a directory testing all the supported patterns. + The first patterns to return a non-empty list of data files is returned. 
+ + Some examples of supported patterns: + + Input: + + my_dataset_repository/ + ├── README.md + └── dataset.csv + + Output: + + {"train": ["**"]} + + Input: + + my_dataset_repository/ + ├── README.md + ├── train.csv + └── test.csv + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train.csv + └── test.csv + + my_dataset_repository/ + ├── README.md + ├── train_0.csv + ├── train_1.csv + ├── train_2.csv + ├── train_3.csv + ├── test_0.csv + └── test_1.csv + + Output: + + {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', + 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], + 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', + 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + + Input: + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train/ + │ ├── shard_0.csv + │ ├── shard_1.csv + │ ├── shard_2.csv + │ └── shard_3.csv + └── test/ + ├── shard_0.csv + └── shard_1.csv + + Output: + + {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', + 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], + 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', + 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + + Input: + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train-00000-of-00003.csv + ├── train-00001-of-00003.csv + ├── train-00002-of-00003.csv + ├── test-00000-of-00001.csv + ├── random-00000-of-00003.csv + ├── random-00001-of-00003.csv + └── random-00002-of-00003.csv + + Output: + + {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], + 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], + 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']} + + In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS. 
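    A hedged usage sketch (hypothetical local path), mirroring how the module
    factories below consume the inferred patterns:

        >>> patterns = _get_data_patterns('path/to/my_dataset_repository')
        >>> data_files = DataFilesDict.from_patterns(
        ...     patterns,
        ...     base_path='path/to/my_dataset_repository',
        ...     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
        ... )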
+ """ + resolver = partial( + _resolve_pattern, base_path=base_path, download_config=download_config) + try: + return _get_data_files_patterns(resolver) + except FileNotFoundError: + raise EmptyDatasetError( + f"The directory at {base_path} doesn't contain any data files" + ) from None + + +def get_module_without_script(self) -> DatasetModule: + _ms_api = HubApi() + _repo_id: str = self.name + _namespace, _dataset_name = _repo_id.split('/') + + # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info( + # self.name, + # revision=self.revision, + # token=self.download_config.token, + # timeout=100.0, + # ) + # even if metadata_configs is not None (which means that we will resolve files for each config later) + # we cannot skip resolving all files because we need to infer module name by files extensions + # revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime + revision = self.revision or 'master' + base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip( + '/') + + download_config = self.download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading [README.md]' + try: + url_or_filename = _ms_api.get_dataset_file_url( + file_name='README.md', + dataset_name=_dataset_name, + namespace=_namespace, + revision=revision, + extension_filter=False, + ) + + dataset_readme_path = cached_path( + url_or_filename=url_or_filename, download_config=download_config) + dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data + except FileNotFoundError: + dataset_card_data = DatasetCardData() + + subset_name: str = download_config.storage_options.get('name', None) + + metadata_configs = MetadataConfigs.from_dataset_card_data( + dataset_card_data) + dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) + # we need a set of data files to find which dataset builder to use + # because we need to infer module name by files extensions + if self.data_files is not None: + patterns = sanitize_patterns(self.data_files) + elif metadata_configs and 'data_files' in next( + iter(metadata_configs.values())): + + if subset_name is not None: + subset_data_files = metadata_configs[subset_name]['data_files'] + else: + subset_data_files = next(iter(metadata_configs.values()))['data_files'] + patterns = sanitize_patterns(subset_data_files) + else: + patterns = _get_data_patterns( + base_path, download_config=self.download_config) + + data_files = DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=ALL_ALLOWED_EXTENSIONS, + download_config=self.download_config, + ) + module_name, default_builder_kwargs = infer_module_for_data_files( + data_files=data_files, + path=self.name, + download_config=self.download_config, + ) + data_files = data_files.filter_extensions( + _MODULE_TO_EXTENSIONS[module_name]) + # Collect metadata files if the module supports them + supports_metadata = module_name in _MODULE_SUPPORTS_METADATA + if self.data_files is None and supports_metadata: + try: + metadata_patterns = get_metadata_patterns( + base_path, download_config=self.download_config) + except FileNotFoundError: + metadata_patterns = None + if metadata_patterns is not None: + metadata_data_files_list = DataFilesList.from_patterns( + metadata_patterns, + download_config=self.download_config, + base_path=base_path) + if metadata_data_files_list: + data_files = DataFilesDict({ + split: data_files_list + metadata_data_files_list + for split, data_files_list in 
data_files.items() + }) + + module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] + + if metadata_configs: + builder_configs, default_config_name = create_builder_configs_from_metadata_configs( + module_path, + metadata_configs, + base_path=base_path, + supports_metadata=supports_metadata, + default_builder_kwargs=default_builder_kwargs, + download_config=self.download_config, + ) + else: + builder_configs: List[BuilderConfig] = [ + import_main_class(module_path).BUILDER_CONFIG_CLASS( + data_files=data_files, + **default_builder_kwargs, + ) + ] + default_config_name = None + builder_kwargs = { + # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), + 'base_path': + _ms_api.get_file_base_path( + namespace=_namespace, + dataset_name=_dataset_name, + ), + 'repo_id': + self.name, + 'dataset_name': + camelcase_to_snakecase(Path(self.name).name), + 'data_files': data_files, + } + download_config = self.download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading metadata' + + # Note: `dataset_infos.json` is deprecated and can cause an error during loading if it exists + + if default_config_name is None and len(dataset_infos) == 1: + default_config_name = next(iter(dataset_infos)) + + hash = revision + return DatasetModule( + module_path, + hash, + builder_kwargs, + dataset_infos=dataset_infos, + builder_configs_parameters=BuilderConfigsParameters( + metadata_configs=metadata_configs, + builder_configs=builder_configs, + default_config_name=default_config_name, + ), + ) + + +def _download_additional_modules( + name: str, + dataset_name: str, + namespace: str, + revision: str, + imports: Tuple[str, str, str, str], + download_config: Optional[DownloadConfig] +) -> List[Tuple[str, str]]: + """ + Download additional module for a module .py at URL (or local path) /.py + The imports must have been parsed first using ``get_imports``. + + If some modules need to be installed with pip, an error is raised showing how to install them. + This function return the list of downloaded modules as tuples (import_name, module_file_path). + + The downloaded modules can then be moved into an importable directory + with ``_copy_script_and_other_resources_in_importable_dir``. + """ + local_imports = [] + library_imports = [] + download_config = download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading extra modules' + for import_type, import_name, import_path, sub_directory in imports: + if import_type == 'library': + library_imports.append((import_name, import_path)) # Import from a library + continue + + if import_name == name: + raise ValueError( + f'Error in the {name} script, importing relative {import_name} module ' + f'but {import_name} is the name of the script. ' + f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' " + f'comment pointing to the original relative import file path.' 
+ ) + if import_type == 'internal': + _api = HubApi() + # url_or_filename = url_or_path_join(base_path, import_path + ".py") + file_name = import_path + '.py' + url_or_filename = _api.get_dataset_file_url(file_name=file_name, + dataset_name=dataset_name, + namespace=namespace, + revision=revision,) + elif import_type == 'external': + url_or_filename = import_path + else: + raise ValueError('Wrong import_type') + + local_import_path = cached_path( + url_or_filename, + download_config=download_config, + ) + if sub_directory is not None: + local_import_path = os.path.join(local_import_path, sub_directory) + local_imports.append((import_name, local_import_path)) + + # Check library imports + needs_to_be_installed = {} + for library_import_name, library_import_path in library_imports: + try: + lib = importlib.import_module(library_import_name) # noqa F841 + except ImportError: + if library_import_name not in needs_to_be_installed or library_import_path != library_import_name: + needs_to_be_installed[library_import_name] = library_import_path + if needs_to_be_installed: + _dependencies_str = 'dependencies' if len(needs_to_be_installed) > 1 else 'dependency' + _them_str = 'them' if len(needs_to_be_installed) > 1 else 'it' + if 'sklearn' in needs_to_be_installed.keys(): + needs_to_be_installed['sklearn'] = 'scikit-learn' + if 'Bio' in needs_to_be_installed.keys(): + needs_to_be_installed['Bio'] = 'biopython' + raise ImportError( + f'To be able to use {name}, you need to install the following {_dependencies_str}: ' + f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install " + f"{' '.join(needs_to_be_installed.values())}' for instance." + ) + return local_imports + + +def get_module_with_script(self) -> DatasetModule: + + _api = HubApi() + _dataset_name: str = self.name.split('/')[-1] + _namespace: str = self.name.split('/')[0] + + script_file_name = f'{_dataset_name}.py' + script_url: str = _api.get_dataset_file_url( + file_name=script_file_name, + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + extension_filter=False, + ) + local_script_path = cached_path( + url_or_filename=script_url, download_config=self.download_config) + + dataset_infos_path = None + # try: + # dataset_infos_url: str = _api.get_dataset_file_url( + # file_name='dataset_infos.json', + # dataset_name=_dataset_name, + # namespace=_namespace, + # revision=self.revision, + # extension_filter=False, + # ) + # dataset_infos_path = cached_path( + # url_or_filename=dataset_infos_url, download_config=self.download_config) + # except Exception as e: + # logger.info(f'Cannot find dataset_infos.json: {e}') + # dataset_infos_path = None + + dataset_readme_url: str = _api.get_dataset_file_url( + file_name='README.md', + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + extension_filter=False, + ) + dataset_readme_path = cached_path( + url_or_filename=dataset_readme_url, download_config=self.download_config) + + imports = get_imports(local_script_path) + local_imports = _download_additional_modules( + name=self.name, + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + imports=imports, + download_config=self.download_config, + ) + additional_files = [] + if dataset_infos_path: + additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path)) + if dataset_readme_path: + additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path)) + # copy the script and the files in an importable directory + 
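# A minimal, self-contained sketch (hypothetical helper name) of the library
# check performed by _download_additional_modules above: try to import every
# library a dataset script declares and collect the pip names of anything missing.
def _missing_libraries(libraries):
    # libraries: {import_name: pip_name}, e.g. {'Bio': 'biopython', 'sklearn': 'scikit-learn'}
    missing = {}
    for import_name, pip_name in libraries.items():
        try:
            importlib.import_module(import_name)
        except ImportError:
            missing[import_name] = pip_name
    return missing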
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules() + hash = files_to_hash([local_script_path] + [loc[1] for loc in local_imports]) + importable_file_path = _get_importable_file_path( + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + ) + if not os.path.exists(importable_file_path): + trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name) + if trust_remote_code: + _create_importable_file( + local_path=local_script_path, + local_imports=local_imports, + additional_files=additional_files, + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + download_mode=self.download_mode, + ) + else: + raise ValueError( + f'Loading {self.name} requires you to execute the dataset script in that' + ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then' + ' set the option `trust_remote_code=True` to remove this error.' + ) + module_path, hash = _load_importable_file( + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + ) + # make the new module to be noticed by the import system + importlib.invalidate_caches() + builder_kwargs = { + # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), + 'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name), + 'repo_id': self.name, + } + return DatasetModule(module_path, hash, builder_kwargs) + + +class DatasetsWrapperHF: + + @staticmethod + def load_dataset( + path: str, + name: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None, + split: Optional[Union[str, Split]] = None, + cache_dir: Optional[str] = None, + features: Optional[Features] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + verification_mode: Optional[Union[VerificationMode, str]] = None, + keep_in_memory: Optional[bool] = None, + save_infos: bool = False, + revision: Optional[Union[str, Version]] = None, + token: Optional[Union[bool, str]] = None, + use_auth_token='deprecated', + task='deprecated', + streaming: bool = False, + num_proc: Optional[int] = None, + storage_options: Optional[Dict] = None, + trust_remote_code: bool = True, + dataset_info_only: Optional[bool] = False, + **config_kwargs, + ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset, + dict]: + + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + "You can remove this warning by passing 'token=' instead.", + FutureWarning, + ) + token = use_auth_token + if task != 'deprecated': + warnings.warn( + "'task' was deprecated in version 2.13.0 and will be removed in 3.0.0.\n", + FutureWarning, + ) + else: + task = None + if data_files is not None and not data_files: + raise ValueError( + f"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default)." + ) + if Path(path, config.DATASET_STATE_JSON_FILENAME).exists( + ): + raise ValueError( + 'You are trying to load a dataset that was saved using `save_to_disk`. 
' + 'Please use `load_from_disk` instead.') + + if streaming and num_proc is not None: + raise NotImplementedError( + 'Loading a streaming dataset in parallel with `num_proc` is not implemented. ' + 'To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader ' + 'using `num_workers` > 1 instead.') + + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + verification_mode = VerificationMode(( + verification_mode or VerificationMode.BASIC_CHECKS + ) if not save_infos else VerificationMode.ALL_CHECKS) + + # Create a dataset builder + builder_instance = DatasetsWrapperHF.load_dataset_builder( + path=path, + name=name, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_dir, + features=features, + download_config=download_config, + download_mode=download_mode, + revision=revision, + token=token, + storage_options=storage_options, + trust_remote_code=trust_remote_code, + _require_default_config_name=name is None, + **config_kwargs, + ) + + # Note: Only for preview mode + if dataset_info_only: + ret_dict = {} + # Get dataset config info from python script + if isinstance(path, str) and path.endswith('.py') and os.path.exists(path): + from datasets import get_dataset_config_names + subset_list = get_dataset_config_names(path) + ret_dict = {_subset: [] for _subset in subset_list} + return ret_dict + + if builder_instance is None or not hasattr(builder_instance, + 'builder_configs'): + logger.error(f'No builder_configs found for {path} dataset.') + return ret_dict + + _tmp_builder_configs = builder_instance.builder_configs + for tmp_config_name, tmp_builder_config in _tmp_builder_configs.items(): + tmp_config_name = str(tmp_config_name) + if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None: + ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())] + else: + ret_dict[tmp_config_name] = [] + return ret_dict + + # Return iterable dataset in case of streaming + if streaming: + return builder_instance.as_streaming_dataset(split=split) + + # Some datasets are already processed on the HF google storage + # Don't try downloading from Google storage for the packaged datasets as text, json, csv or pandas + # try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES + + # Download and prepare data + builder_instance.download_and_prepare( + download_config=download_config, + download_mode=download_mode, + verification_mode=verification_mode, + num_proc=num_proc, + storage_options=storage_options, + # base_path=builder_instance.base_path, + # file_format=builder_instance.name or 'arrow', + ) + + # Build dataset for splits + keep_in_memory = ( + keep_in_memory if keep_in_memory is not None else is_small_dataset( + builder_instance.info.dataset_size)) + ds = builder_instance.as_dataset( + split=split, + verification_mode=verification_mode, + in_memory=keep_in_memory) + # Rename and cast features to match task schema + if task is not None: + # To avoid issuing the same warning twice + with warnings.catch_warnings(): + warnings.simplefilter('ignore', FutureWarning) + ds = ds.prepare_for_task(task) + if save_infos: + builder_instance._save_infos() + + try: + _api = HubApi() + if is_relative_path(path) and path.count('/') == 1: + _namespace, _dataset_name = path.split('/') + _api.dataset_download_statistics(dataset_name=_dataset_name, namespace=_namespace) + except Exception as e: + logger.warning(f'Could not record download statistics: {e}') + + return ds + + @staticmethod + 
def load_dataset_builder( + path: str, + name: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None, + cache_dir: Optional[str] = None, + features: Optional[Features] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + revision: Optional[Union[str, Version]] = None, + token: Optional[Union[bool, str]] = None, + use_auth_token='deprecated', + storage_options: Optional[Dict] = None, + trust_remote_code: Optional[bool] = None, + _require_default_config_name=True, + **config_kwargs, + ) -> DatasetBuilder: + + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + "You can remove this warning by passing 'token=' instead.", + FutureWarning, + ) + token = use_auth_token + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + if token is not None: + download_config = download_config.copy( + ) if download_config else DownloadConfig() + download_config.token = token + if storage_options is not None: + download_config = download_config.copy( + ) if download_config else DownloadConfig() + download_config.storage_options.update(storage_options) + + dataset_module = DatasetsWrapperHF.dataset_module_factory( + path, + revision=revision, + download_config=download_config, + download_mode=download_mode, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + _require_default_config_name=_require_default_config_name, + _require_custom_configs=bool(config_kwargs), + name=name, + ) + # Get dataset builder class from the processing script + builder_kwargs = dataset_module.builder_kwargs + data_dir = builder_kwargs.pop('data_dir', data_dir) + data_files = builder_kwargs.pop('data_files', data_files) + config_name = builder_kwargs.pop( + 'config_name', name + or dataset_module.builder_configs_parameters.default_config_name) + dataset_name = builder_kwargs.pop('dataset_name', None) + info = dataset_module.dataset_infos.get( + config_name) if dataset_module.dataset_infos else None + + if (path in _PACKAGED_DATASETS_MODULES and data_files is None + and dataset_module.builder_configs_parameters. + builder_configs[0].data_files is None): + error_msg = f'Please specify the data files or data directory to load for the {path} dataset builder.' 
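# Hedged usage sketch: packaged builders such as 'csv' or 'json' have no repo
# to inspect, so data files must be given explicitly (the path is hypothetical);
# this is exactly the situation the error message constructed above covers.
builder = DatasetsWrapperHF.load_dataset_builder(
    'csv',
    data_files={'train': 'path/to/train.csv'},
)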
+ example_extensions = [ + extension for extension in _EXTENSION_TO_MODULE + if _EXTENSION_TO_MODULE[extension] == path + ] + if example_extensions: + error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`' + raise ValueError(error_msg) + + builder_cls = get_dataset_builder_class( + dataset_module, dataset_name=dataset_name) + + builder_instance: DatasetBuilder = builder_cls( + cache_dir=cache_dir, + dataset_name=dataset_name, + config_name=config_name, + data_dir=data_dir, + data_files=data_files, + hash=dataset_module.hash, + info=info, + features=features, + token=token, + storage_options=storage_options, + **builder_kwargs, # contains base_path + **config_kwargs, + ) + builder_instance._use_legacy_cache_dir_if_possible(dataset_module) + + return builder_instance + + @staticmethod + def dataset_module_factory( + path: str, + revision: Optional[Union[str, Version]] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + dynamic_modules_path: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[Dict, List, str, DataFilesDict]] = None, + cache_dir: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + _require_default_config_name=True, + _require_custom_configs=False, + **download_kwargs, + ) -> DatasetModule: + + subset_name: str = download_kwargs.pop('name', None) + if download_config is None: + download_config = DownloadConfig(**download_kwargs) + download_config.storage_options.update({'name': subset_name}) + + if download_config and download_config.cache_dir is None: + download_config.cache_dir = MS_DATASETS_CACHE + + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + download_config.extract_compressed_file = True + download_config.force_extract = True + download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD + + filename = list( + filter(lambda x: x, + path.replace(os.sep, '/').split('/')))[-1] + if not filename.endswith('.py'): + filename = filename + '.py' + combined_path = os.path.join(path, filename) + + # We have several ways to get a dataset builder: + # + # - if path is the name of a packaged dataset module + # -> use the packaged module (json, csv, etc.) + # + # - if os.path.join(path, name) is a local python file + # -> use the module from the python file + # - if path is a local directory (but no python file) + # -> use a packaged module (csv, text etc.) based on content of the directory + # + # - if path has one "/" and is dataset repository on the HF hub with a python file + # -> the module from the python file in the dataset repository + # - if path has one "/" and is dataset repository on the HF hub without a python file + # -> use a packaged module (csv, text etc.) 
based on content of the repository + + # Try packaged + if path in _PACKAGED_DATASETS_MODULES: + return PackagedDatasetModuleFactory( + path, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + ).get_module() + # Try locally + elif path.endswith(filename): + if os.path.isfile(path): + return LocalDatasetModuleFactoryWithScript( + path, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(path)}" + ) + elif os.path.isfile(combined_path): + return LocalDatasetModuleFactoryWithScript( + combined_path, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + elif os.path.isdir(path): + return LocalDatasetModuleFactoryWithoutScript( + path, + data_dir=data_dir, + data_files=data_files, + download_mode=download_mode).get_module() + # Try remotely + elif is_relative_path(path) and path.count('/') <= 1: + try: + _raise_if_offline_mode_is_enabled() + + try: + dataset_info = HfApi().dataset_info( + repo_id=path, + revision=revision, + token=download_config.token, + timeout=100.0, + ) + except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist + if isinstance( + e, + ( # noqa: E131 + OfflineModeIsEnabled, # noqa: E131 + requests.exceptions. + ConnectTimeout, # noqa: E131, E261 + requests.exceptions.ConnectionError, # noqa: E131 + ), # noqa: E131 + ): + raise ConnectionError( + f"Couldn't reach '{path}' on the Hub ({type(e).__name__})" + ) + elif '404' in str(e): + msg = f"Dataset '{path}' doesn't exist on the Hub" + raise DatasetNotFoundError( + msg + + f" at revision '{revision}'" if revision else msg + ) + elif '401' in str(e): + msg = f"Dataset '{path}' doesn't exist on the Hub" + msg = msg + f" at revision '{revision}'" if revision else msg + raise DatasetNotFoundError( + msg + '. If the repo is private or gated, ' + 'make sure to log in with `huggingface-cli login`.' + ) + else: + raise e + if filename in [ + sibling.rfilename for sibling in dataset_info.siblings + ]: # contains a dataset script + + # fs = HfFileSystem( + # endpoint=config.HF_ENDPOINT, + # token=download_config.token) + + # TODO + can_load_config_from_parquet_export = False + # if _require_custom_configs: + # can_load_config_from_parquet_export = False + # elif _require_default_config_name: + # with fs.open( + # f'datasets/{path}/{filename}', + # 'r', + # revision=revision, + # encoding='utf-8') as f: + # can_load_config_from_parquet_export = 'DEFAULT_CONFIG_NAME' not in f.read( + # ) + # else: + # can_load_config_from_parquet_export = True + if config.USE_PARQUET_EXPORT and can_load_config_from_parquet_export: + # If the parquet export is ready (parquet files + info available for the current sha), + # we can use it instead + # This fails when the dataset has multiple configs and a default config and + # the user didn't specify a configuration name (_require_default_config_name=True). 
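# Hedged illustration of the resolution order described in the comment block
# above (paths and repo ids are hypothetical):
module_packaged = DatasetsWrapperHF.dataset_module_factory('csv')                      # packaged module
module_local = DatasetsWrapperHF.dataset_module_factory('./my_data')                   # local dir, packaged module inferred
module_hub = DatasetsWrapperHF.dataset_module_factory('some_namespace/some_dataset')   # hub repo, with or without a script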
+ try: + return HubDatasetModuleFactoryWithParquetExport( + path, + download_config=download_config, + revision=dataset_info.sha).get_module() + except Exception as e: + logger.error(e) + + # Otherwise we must use the dataset script if the user trusts it + return HubDatasetModuleFactoryWithScript( + path, + revision=revision, + download_config=download_config, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + else: + return HubDatasetModuleFactoryWithoutScript( + path, + revision=revision, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + ).get_module() + except Exception as e1: + # All the attempts failed, before raising the error we should check if the module is already cached + logger.error(f'>> Error loading {path}: {e1}') + try: + return CachedDatasetModuleFactory( + path, + dynamic_modules_path=dynamic_modules_path, + cache_dir=cache_dir).get_module() + except Exception: + # If it's not in the cache, then it doesn't exist. + if isinstance(e1, OfflineModeIsEnabled): + raise ConnectionError( + f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}" + ) from None + if isinstance(e1, + (DataFilesNotFoundError, + DatasetNotFoundError, EmptyDatasetError)): + raise e1 from None + if isinstance(e1, FileNotFoundError): + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or " + f'any data file in the same directory. ' + f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" + ) from None + raise e1 from None + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or " + f'any data file in the same directory.') + + +@contextlib.contextmanager +def load_dataset_with_ctx(*args, **kwargs): + + # Keep the original functions + hf_endpoint_origin = config.HF_ENDPOINT + get_from_cache_origin = file_utils.get_from_cache + + # Compatible with datasets 2.18.0 + _download_origin = DownloadManager._download if hasattr(DownloadManager, '_download') \ + else DownloadManager._download_single + + dataset_info_origin = HfApi.dataset_info + list_repo_tree_origin = HfApi.list_repo_tree + get_paths_info_origin = HfApi.get_paths_info + resolve_pattern_origin = data_files.resolve_pattern + get_module_without_script_origin = HubDatasetModuleFactoryWithoutScript.get_module + get_module_with_script_origin = HubDatasetModuleFactoryWithScript.get_module + + # Monkey patching with modelscope functions + config.HF_ENDPOINT = get_endpoint() + file_utils.get_from_cache = get_from_cache_ms + # Compatible with datasets 2.18.0 + if hasattr(DownloadManager, '_download'): + DownloadManager._download = _download_ms + else: + DownloadManager._download_single = _download_ms + HfApi.dataset_info = _dataset_info + HfApi.list_repo_tree = _list_repo_tree + HfApi.get_paths_info = _get_paths_info + data_files.resolve_pattern = _resolve_pattern + HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script + HubDatasetModuleFactoryWithScript.get_module = get_module_with_script + + streaming = kwargs.get('streaming', False) + + try: + dataset_res = DatasetsWrapperHF.load_dataset(*args, **kwargs) + yield dataset_res + finally: + # Restore the original functions + config.HF_ENDPOINT = hf_endpoint_origin + file_utils.get_from_cache = get_from_cache_origin + # Keep the context during the streaming iteration + if not streaming: + 
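# A minimal, self-contained sketch (hypothetical helper) of the patch/restore
# pattern that load_dataset_with_ctx applies above: remember the original
# attribute, install the replacement, and always put the original back on exit.
@contextlib.contextmanager
def _patched(obj, attr, replacement):
    original = getattr(obj, attr)
    setattr(obj, attr, replacement)
    try:
        yield
    finally:
        setattr(obj, attr, original)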
config.HF_ENDPOINT = hf_endpoint_origin + file_utils.get_from_cache = get_from_cache_origin + + # Compatible with datasets 2.18.0 + if hasattr(DownloadManager, '_download'): + DownloadManager._download = _download_origin + else: + DownloadManager._download_single = _download_origin + + HfApi.dataset_info = dataset_info_origin + HfApi.list_repo_tree = list_repo_tree_origin + HfApi.get_paths_info = get_paths_info_origin + data_files.resolve_pattern = resolve_pattern_origin + HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script_origin + HubDatasetModuleFactoryWithScript.get_module = get_module_with_script_origin diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py new file mode 100644 index 000000000..863bb1960 --- /dev/null +++ b/modelscope/msdatasets/utils/hf_file_utils.py @@ -0,0 +1,346 @@ +# noqa: isort:skip_file, yapf: disable +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. + +import json +import os +import re +import copy +import shutil +import time +import warnings +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Optional, Union +from urllib.parse import urljoin, urlparse +import requests +from tqdm import tqdm + +from datasets import config +from datasets.utils.file_utils import hash_url_to_filename, \ + get_authentication_headers_for_url, fsspec_head, fsspec_get +from filelock import FileLock + +from modelscope.utils.config_ds import MS_DATASETS_CACHE +from modelscope.utils.logger import get_logger +from modelscope.hub.api import ModelScopeConfig + +from modelscope import __version__ + +logger = get_logger() + + +def get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str: + ua = f'datasets/{__version__}' + ua += f'; python/{config.PY_VERSION}' + ua += f'; pyarrow/{config.PYARROW_VERSION}' + if config.TORCH_AVAILABLE: + ua += f'; torch/{config.TORCH_VERSION}' + if config.TF_AVAILABLE: + ua += f'; tensorflow/{config.TF_VERSION}' + if config.JAX_AVAILABLE: + ua += f'; jax/{config.JAX_VERSION}' + if isinstance(user_agent, dict): + ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}" + elif isinstance(user_agent, str): + ua += '; ' + user_agent + return ua + + +def _request_with_retry_ms( + method: str, + url: str, + max_retries: int = 2, + base_wait_time: float = 0.5, + max_wait_time: float = 2, + timeout: float = 10.0, + **params, +) -> requests.Response: + """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff. + + Note that if the environment variable HF_DATASETS_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised. + + Args: + method (str): HTTP method, such as 'GET' or 'HEAD'. + url (str): The URL of the resource to fetch. + max_retries (int): Maximum number of retries, defaults to 0 (no retries). + base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between + retries then grows exponentially, capped by max_wait_time. + max_wait_time (float): Maximum amount of time between two retries, in seconds. + **params (additional keyword arguments): Params to pass to :obj:`requests.request`. 
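    A hedged sketch of the resulting backoff schedule (pure arithmetic, capped
    at max_wait_time):

        >>> base_wait_time, max_wait_time = 0.5, 2.0
        >>> [min(max_wait_time, base_wait_time * 2 ** (t - 1)) for t in range(1, 5)]
        [0.5, 1.0, 2.0, 2.0]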
+ """ + tries, success = 0, False + response = None + while not success: + tries += 1 + try: + response = requests.request(method=method.upper(), url=url, timeout=timeout, **params) + success = True + except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err: + if tries > max_retries: + raise err + else: + logger.info(f'{method} request to {url} timed out, retrying... [{tries/max_retries}]') + sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff + time.sleep(sleep_time) + return response + + +def http_head_ms( + url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0 +) -> requests.Response: + headers = copy.deepcopy(headers) or {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + response = _request_with_retry_ms( + method='HEAD', + url=url, + proxies=proxies, + headers=headers, + cookies=cookies, + allow_redirects=allow_redirects, + timeout=timeout, + max_retries=max_retries, + ) + return response + + +def http_get_ms( + url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None +) -> Optional[requests.Response]: + headers = dict(headers) if headers is not None else {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + if resume_size > 0: + headers['Range'] = f'bytes={resume_size:d}-' + response = _request_with_retry_ms( + method='GET', + url=url, + stream=True, + proxies=proxies, + headers=headers, + cookies=cookies, + max_retries=max_retries, + timeout=timeout, + ) + if temp_file is None: + return response + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get('Content-Length') + total = resume_size + int(content_length) if content_length is not None else None + + progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading') + for chunk in response.iter_content(chunk_size=1024): + progress.update(len(chunk)) + temp_file.write(chunk) + + progress.close() + + +def get_from_cache_ms( + url, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=100, + resume_download=False, + user_agent=None, + local_files_only=False, + use_etag=True, + max_retries=0, + token=None, + use_auth_token='deprecated', + ignore_url_params=False, + storage_options=None, + download_desc=None, + disable_tqdm=None, +) -> str: + """ + Given a URL, look for the corresponding file in the local cache. + If it's not there, download it. Then return the path to the cached file. 
+ + Return: + Local path (string) + + Raises: + FileNotFoundError: in case of non-recoverable file + (non-existent or no cache on disk) + ConnectionError: in case of unreachable url + and no cache on disk + """ + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + f"You can remove this warning by passing 'token={use_auth_token}' instead.", + FutureWarning, + ) + token = use_auth_token + if cache_dir is None: + cache_dir = MS_DATASETS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + if ignore_url_params: + # strip all query parameters and #fragments from the URL + cached_url = urljoin(url, urlparse(url).path) + else: + cached_url = url # additional parameters may be added to the given URL + + connected = False + response = None + cookies = None + etag = None + head_error = None + scheme = None + + # Try a first time to file the file on the local file system without eTag (None) + # if we don't ask for 'force_download' then we spare a request + filename = hash_url_to_filename(cached_url, etag=None) + cache_path = os.path.join(cache_dir, filename) + if download_desc is None: + download_desc = 'Downloading [' + filename + ']' + + if os.path.exists(cache_path) and not force_download and not use_etag: + return cache_path + + # Prepare headers for authentication + headers = get_authentication_headers_for_url(url, token=token) + if user_agent is not None: + headers['user-agent'] = user_agent + + # We don't have the file locally or we need an eTag + if not local_files_only: + scheme = urlparse(url).scheme + if scheme not in ('http', 'https'): + response = fsspec_head(url, storage_options=storage_options) + # s3fs uses "ETag", gcsfs uses "etag" + etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None + connected = True + try: + cookies = ModelScopeConfig.get_cookies() + response = http_head_ms( + url, + allow_redirects=True, + proxies=proxies, + timeout=etag_timeout, + max_retries=max_retries, + headers=headers, + cookies=cookies, + ) + if response.status_code == 200: # ok + etag = response.headers.get('ETag') if use_etag else None + for k, v in response.cookies.items(): + # In some edge cases, we need to get a confirmation token + if k.startswith('download_warning') and 'drive.google.com' in url: + url += '&confirm=' + v + cookies = response.cookies + connected = True + # Fix Google Drive URL to avoid Virus scan warning + if 'drive.google.com' in url and 'confirm=' not in url: + url += '&confirm=t' + # In some edge cases, head request returns 400 but the connection is actually ok + elif ( + (response.status_code == 400 and 'firebasestorage.googleapis.com' in url) + or (response.status_code == 405 and 'drive.google.com' in url) + or ( + response.status_code == 403 + and ( + re.match(r'^https?://github.com/.*?/.*?/releases/download/.*?/.*?$', url) + or re.match(r'^https://.*?s3.*?amazonaws.com/.*?$', response.url) + ) + ) + or (response.status_code == 403 and 'ndownloader.figstatic.com' in url) + ): + connected = True + logger.info(f"Couldn't get ETag version for url {url}") + elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None: + raise ConnectionError( + f'Unauthorized for URL {url}. 
' + f'Please use the parameter `token=True` after logging in with `huggingface-cli login`' + ) + except (OSError, requests.exceptions.Timeout) as e: + # not connected + head_error = e + pass + + # connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. + # try to get the last downloaded one + if not connected: + if os.path.exists(cache_path) and not force_download: + return cache_path + if local_files_only: + raise FileNotFoundError( + f'Cannot find the requested files in the cached path at {cache_path} and outgoing traffic has been' + " disabled. To enable file online look-ups, set 'local_files_only' to False." + ) + elif response is not None and response.status_code == 404: + raise FileNotFoundError(f"Couldn't find file at {url}") + if head_error is not None: + raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})") + elif response is not None: + raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})") + else: + raise ConnectionError(f"Couldn't reach {url}") + + # Try a second time + filename = hash_url_to_filename(cached_url, etag) + cache_path = os.path.join(cache_dir, filename) + + if os.path.exists(cache_path) and not force_download: + return cache_path + + # From now on, connected is True. + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + '.lock' + with FileLock(lock_path): + # Retry in case previously locked processes just enter after the precedent process releases the lock + if os.path.exists(cache_path) and not force_download: + return cache_path + + incomplete_path = cache_path + '.incomplete' + + @contextmanager + def temp_file_manager(mode='w+b'): + with open(incomplete_path, mode) as f: + yield f + + resume_size = 0 + if resume_download: + temp_file_manager = partial(temp_file_manager, mode='a+b') + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + + # Download to temporary file, then copy to cache path once finished. + # Otherwise, you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + + # GET file object + if scheme not in ('http', 'https'): + fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) + else: + http_get_ms( + url, + temp_file=temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + cookies=cookies, + max_retries=max_retries, + desc=download_desc, + ) + + logger.info(f'storing {url} in cache at {cache_path}') + shutil.move(temp_file.name, cache_path) + umask = os.umask(0o666) + os.umask(umask) + os.chmod(cache_path, 0o666 & ~umask) + + logger.info(f'creating metadata file for {cache_path}') + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding='utf-8') as meta_file: + json.dump(meta, meta_file) + + return cache_path diff --git a/modelscope/outputs/nlp_outputs.py b/modelscope/outputs/nlp_outputs.py index ed42cb5a8..747f5bd3d 100644 --- a/modelscope/outputs/nlp_outputs.py +++ b/modelscope/outputs/nlp_outputs.py @@ -326,7 +326,7 @@ class TextErrorCorrectionOutput(ModelOutputBase): """The output class for information extraction models. 
""" - predictions: np.ndarray = None + predictions: List = None @dataclass diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index a32fc157d..4db9c0bac 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -25,6 +25,10 @@ class OutputKeys(object): MASKS = 'masks' DEPTHS = 'depths' DEPTHS_COLOR = 'depths_color' + FLOWS = 'flows' + FLOWS_COLOR = 'flows_color' + NORMALS = 'normals' + NORMALS_COLOR = 'normals_color' LAYOUT = 'layout' TEXT = 'text' POLYGONS = 'polygons' @@ -69,6 +73,7 @@ class OutputKeys(object): PCD12 = 'pcd12' PCD12_ALIGN = 'pcd12_align' TBOUNDS = 'tbounds' + MV_IMGS = 'MViews' OutputTypes = { @@ -132,6 +137,7 @@ class OutputKeys(object): OutputKeys.PCD12: np.ndarray, OutputKeys.PCD12_ALIGN: np.ndarray, OutputKeys.TBOUNDS: Dict, + OutputKeys.MV_IMGS: List[np.ndarray], } OutputTypeSchema = { @@ -426,6 +432,15 @@ class OutputKeys(object): OutputKeys.TBOUNDS: { 'type': 'object' }, + OutputKeys.MV_IMGS: { + 'type': 'array', + 'items': { + 'type': 'array', + 'items': { + 'type': 'number' + } + } + }, } TASK_OUTPUTS = { @@ -761,6 +776,7 @@ class OutputKeys(object): Tasks.surface_recon_common: [OutputKeys.OUTPUT], Tasks.video_colorization: [OutputKeys.OUTPUT_VIDEO], Tasks.image_control_3d_portrait: [OutputKeys.OUTPUT], + Tasks.self_supervised_depth_completion: [OutputKeys.OUTPUT_IMG], # image quality assessment degradation result for single image # { @@ -1632,6 +1648,8 @@ class OutputKeys(object): # "output_imgs": np.ndarray list with shape [[height, width, 3], ...] # } Tasks.image_view_transform: [OutputKeys.OUTPUT_IMGS], + Tasks.image_to_3d: [OutputKeys.MV_IMGS], + Tasks.siamese_uie: [OutputKeys.OUTPUT], } diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 6e6443765..f281d0e70 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -247,8 +247,10 @@ def check_input_type(input_type, input): InputType.VIDEO, # image generation task result for a single image - Tasks.image_to_image_generation: - InputType.IMAGE, + Tasks.image_to_image_generation: [ + InputType.IMAGE, + (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE, InputType.IMAGE) + ], Tasks.image_to_image_translation: InputType.IMAGE, Tasks.image_style_transfer: { @@ -436,6 +438,8 @@ def check_input_type(input_type, input): Tasks.machine_reading_comprehension: InputType.TEXT, + Tasks.siamese_uie: InputType.TEXT, + # ============ audio tasks =================== Tasks.auto_speech_recognition: # input can be audio, or audio and text. [InputType.AUDIO, { diff --git a/modelscope/pipelines/accelerate/vllm.py b/modelscope/pipelines/accelerate/vllm.py index 5c11c29b0..15ced4bb6 100644 --- a/modelscope/pipelines/accelerate/vllm.py +++ b/modelscope/pipelines/accelerate/vllm.py @@ -42,6 +42,24 @@ def __call__(self, prompts: Union[List[str], List[List[int]]], The string batch or the token list batch to input to the model. kwargs: Sampling parameters. """ + + # convert hf generate config to vllm + do_sample = kwargs.pop('do_sample', None) + num_beam = kwargs.pop('num_beam', 1) + max_length = kwargs.pop('max_length', None) + max_new_tokens = kwargs.pop('max_new_tokens', None) + + # for vllm, default to do_sample/greedy(depends on temperature). 
+ # for hf, do_sample=false, num_beam=1 -> greedy(default) + # do_sample=ture, num_beam=1 -> sample + # do_sample=false, num_beam>1 -> beam_search + if not do_sample and num_beam > 1: + kwargs['use_beam_search'] = True + if max_length: + kwargs['max_tokens'] = max_length - len(prompts[0]) + if max_new_tokens: + kwargs['max_tokens'] = max_new_tokens + from vllm import SamplingParams sampling_params = SamplingParams(**kwargs) if isinstance(prompts[0], str): diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index 3719689c9..60a63722d 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -122,3 +122,126 @@ def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16), self.SAMPLE_RATE) return inputs + + +@PIPELINES.register_module( + Tasks.acoustic_noise_suppression, + module_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base) +class ANSZipEnhancerPipeline(Pipeline): + r"""ANS (Acoustic Noise Suppression) Inference Pipeline . + + When invoke the class with pipeline.__call__(), it accept only one parameter: + inputs(str): the path of wav file + """ + SAMPLE_RATE = 16000 + + def __init__(self, model, **kwargs): + """ + use `model` and `preprocessor` to create a kws pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + self.model.eval() + self.stream_mode = kwargs.get('stream_mode', False) + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if self.stream_mode: + raise TypeError('This model does not support stream mode!') + if isinstance(inputs, bytes): + data1, fs = sf.read(io.BytesIO(inputs)) + elif isinstance(inputs, str): + file_bytes = File.read(inputs) + data1, fs = sf.read(io.BytesIO(file_bytes)) + else: + raise TypeError(f'Unsupported type {type(inputs)}.') + if len(data1.shape) > 1: + data1 = data1[:, 0] + if fs != self.SAMPLE_RATE: + data1 = librosa.resample( + data1, orig_sr=fs, target_sr=self.SAMPLE_RATE) + data1 = audio_norm(data1) + data = data1.astype(np.float32) + inputs = np.reshape(data, [1, data.shape[0]]) + return {'ndarray': inputs, 'nsamples': data.shape[0]} + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + ndarray = inputs['ndarray'] + if isinstance(ndarray, torch.Tensor): + ndarray = ndarray.cpu().numpy() + nsamples = inputs['nsamples'] + decode_do_segement = False + window = 16000 * 2 # 2s + stride = int(window * 0.75) + print('inputs:{}'.format(ndarray.shape)) + b, t = ndarray.shape # size() + if t > window * 3: # 6s + decode_do_segement = True + print('decode_do_segement') + + if t < window: + ndarray = np.concatenate( + [ndarray, np.zeros((ndarray.shape[0], window - t))], 1) + elif decode_do_segement: + if t < window + stride: + padding = window + stride - t + print('padding: {}'.format(padding)) + ndarray = np.concatenate( + [ndarray, np.zeros((ndarray.shape[0], padding))], 1) + else: + if (t - window) % stride != 0: + # padding = t - (t - window) // stride * stride + padding = ( + (t - window) // stride + 1) * stride + window - t + print('padding: {}'.format(padding)) + ndarray = np.concatenate( + [ndarray, + np.zeros((ndarray.shape[0], padding))], 1) + # else: + # if (t - window) % stride != 0: + # padding = t - (t - window) // stride * stride + # print('padding: {}'.format(padding)) + # ndarray = np.concatenate( + # [ndarray, np.zeros((ndarray.shape[0], 
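A condensed sketch of the HF-generate to vLLM kwargs translation added in vllm.py above. The returned dict is what would be handed to vllm.SamplingParams(**kwargs), assuming a vLLM version whose SamplingParams still accepts use_beam_search (as the change above does); prompt_len stands in for len(prompts[0]).

def hf_to_vllm_kwargs(prompt_len: int, **kwargs) -> dict:
    do_sample = kwargs.pop('do_sample', None)
    num_beam = kwargs.pop('num_beam', 1)
    max_length = kwargs.pop('max_length', None)
    max_new_tokens = kwargs.pop('max_new_tokens', None)
    if not do_sample and num_beam > 1:
        kwargs['use_beam_search'] = True  # HF beam search maps to vLLM beam search
    if max_length:
        kwargs['max_tokens'] = max_length - prompt_len  # total budget minus prompt length
    if max_new_tokens:
        kwargs['max_tokens'] = max_new_tokens  # an explicit new-token budget takes precedence
    return kwargs


# hf_to_vllm_kwargs(12, do_sample=False, num_beam=4, max_new_tokens=64)
# -> {'use_beam_search': True, 'max_tokens': 64}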
padding))], 1) + print('inputs after padding:{}'.format(ndarray.shape)) + with torch.no_grad(): + ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device) + b, t = ndarray.shape + if decode_do_segement: + outputs = np.zeros(t) + give_up_length = (window - stride) // 2 + current_idx = 0 + while current_idx + window <= t: + # print('current_idx: {}'.format(current_idx)) + print( + '\rcurrent_idx: {} {:.2f}%'.format( + current_idx, current_idx * 100 / t), + end='') + tmp_input = dict(noisy=ndarray[:, current_idx:current_idx + + window]) + tmp_output = self.model( + tmp_input, )['wav_l2'][0].cpu().numpy() + end_index = current_idx + window - give_up_length + if current_idx == 0: + outputs[current_idx: + end_index] = tmp_output[:-give_up_length] + else: + outputs[current_idx + + give_up_length:end_index] = tmp_output[ + give_up_length:-give_up_length] + current_idx += stride + print('\rcurrent_idx: {} {:.2f}%'.format(current_idx, 100)) + else: + outputs = self.model( + dict(noisy=ndarray))['wav_l2'][0].cpu().numpy() + outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes() + return {OutputKeys.OUTPUT_PCM: outputs} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + if 'output_path' in kwargs.keys(): + sf.write( + kwargs['output_path'], + np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16), + self.SAMPLE_RATE) + return inputs diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py deleted file mode 100644 index f825412c0..000000000 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ /dev/null @@ -1,591 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -import json -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import WavToScp -from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, - generate_scp_from_url, - load_bytes_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.hub import snapshot_download -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['AutomaticSpeechRecognitionPipeline'] - - -@PIPELINES.register_module( - Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference) -class AutomaticSpeechRecognitionPipeline(Pipeline): - """ASR Inference Pipeline - Example: - - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> inference_pipeline = pipeline( - >>> task=Tasks.auto_speech_recognition, - >>> model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch') - - >>> rec_result = inference_pipeline( - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') - >>> print(rec_result) - - """ - - def __init__(self, - model: Union[Model, str] = None, - preprocessor: WavToScp = None, - vad_model: Optional[Union[Model, str]] = None, - vad_model_revision: Optional[str] = None, - punc_model: Optional[Union[Model, str]] = None, - punc_model_revision: Optional[str] = None, - lm_model: Optional[Union[Model, str]] = None, - lm_model_revision: Optional[str] = None, - timestamp_model: Optional[Union[Model, str]] = None, - 
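A minimal sketch of the overlap-and-discard stitching that ANSZipEnhancerPipeline.forward() above applies to long inputs. enhance_chunk is an identity stand-in for the model call, and zero-padding to a whole number of strides is omitted here (the pipeline pads as shown above and trims back to the original length afterwards).

import numpy as np


def enhance_chunk(chunk: np.ndarray) -> np.ndarray:
    return chunk  # stand-in for self.model(dict(noisy=chunk))['wav_l2']


def stitch_enhanced(signal: np.ndarray, window: int = 32000) -> np.ndarray:
    stride = int(window * 0.75)  # 75% hop, so consecutive chunks overlap by 25%
    give_up = (window - stride) // 2  # edge samples dropped from each chunk
    assert (len(signal) - window) % stride == 0, 'pad the signal first (see the pipeline)'
    out = np.zeros(len(signal))
    idx = 0
    while idx + window <= len(signal):
        chunk_out = enhance_chunk(signal[idx:idx + window])
        end = idx + window - give_up
        if idx == 0:
            out[idx:end] = chunk_out[:-give_up]  # keep the leading edge of the first chunk
        else:
            out[idx + give_up:end] = chunk_out[give_up:-give_up]  # drop both edges elsewhere
        idx += stride
    return out  # the trailing give_up samples stay zero, as in the pipeline before trimming

With a 2 s window at 16 kHz and a 75% stride, each interior chunk contributes only its central 1.5 s and 0.25 s on either side is discarded, which avoids boundary artifacts between chunks.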
timestamp_model_revision: Optional[str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` and `preprocessor` to create an asr pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - preprocessor: - (list of) Preprocessor object - vad_model (Optional: 'Model' or 'str'): - voice activity detection model from model hub or local - example: 'damo/speech_fsmn_vad_zh-cn-16k-common-pytorch' - punc_model (Optional: 'Model' or 'str'): - punctuation model from model hub or local - example: 'damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' - lm_model (Optional: 'Model' or 'str'): - language model from model hub or local - example: 'damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch' - timestamp_model (Optional: 'Model' or 'str'): - timestamp model from model hub or local - example: 'damo/speech_timestamp_predictor-v1-16k-offline' - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - beam_size('int'): - beam size for decoding - ctc_weight('float'): - the CTC weight in joint decoding - lm_weight('float'): - lm weight - decoding_ind('int', defaults to 0): - decoding ind - decoding_mode('str', defaults to 'model1'): - decoding mode - vad_model_file('str'): - vad model file - vad_infer_config('str'): - VAD infer configuration - vad_cmvn_file('str'): - global CMVN file - punc_model_file('str'): - punc model file - punc_infer_config('str'): - punc infer config - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.vad_model = vad_model - self.vad_model_revision = vad_model_revision - self.punc_model = punc_model - self.punc_model_revision = punc_model_revision - self.lm_model = lm_model - self.lm_model_revision = lm_model_revision - self.timestamp_model = timestamp_model - self.timestamp_model_revision = timestamp_model_revision - self.model_cfg = self.model.forward() - - self.cmd = self.get_cmd(kwargs, model) - from funasr.bin import asr_inference_launch - self.funasr_infer_modelscope = asr_inference_launch.inference_launch( - mode=self.cmd['mode'], - maxlenratio=self.cmd['maxlenratio'], - minlenratio=self.cmd['minlenratio'], - batch_size=self.cmd['batch_size'], - beam_size=self.cmd['beam_size'], - ngpu=ngpu, - ctc_weight=self.cmd['ctc_weight'], - lm_weight=self.cmd['lm_weight'], - penalty=self.cmd['penalty'], - log_level=self.cmd['log_level'], - asr_train_config=self.cmd['asr_train_config'], - asr_model_file=self.cmd['asr_model_file'], - cmvn_file=self.cmd['cmvn_file'], - lm_file=self.cmd['lm_file'], - token_type=self.cmd['token_type'], - key_file=self.cmd['key_file'], - lm_train_config=self.cmd['lm_train_config'], - bpemodel=self.cmd['bpemodel'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - output_dir=self.cmd['output_dir'], - dtype=self.cmd['dtype'], - seed=self.cmd['seed'], - ngram_weight=self.cmd['ngram_weight'], - nbest=self.cmd['nbest'], - num_workers=self.cmd['num_workers'], - vad_infer_config=self.cmd['vad_infer_config'], - vad_model_file=self.cmd['vad_model_file'], - vad_cmvn_file=self.cmd['vad_cmvn_file'], - punc_model_file=self.cmd['punc_model_file'], - punc_infer_config=self.cmd['punc_infer_config'], - timestamp_model_file=self.cmd['timestamp_model_file'], - timestamp_infer_config=self.cmd['timestamp_infer_config'], - 
timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'], - outputs_dict=self.cmd['outputs_dict'], - param_dict=self.cmd['param_dict'], - token_num_relax=self.cmd['token_num_relax'], - decoding_ind=self.cmd['decoding_ind'], - decoding_mode=self.cmd['decoding_mode'], - fake_streaming=self.cmd['fake_streaming'], - model_lang=self.cmd['model_lang'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - from funasr.utils import asr_utils - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type - audio_format('str'): - audio format - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The asr result. - """ - - # code base - # code_base = self.cmd['code_base'] - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - self.raw_inputs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - self.cmd['param_dict'] = param_dict - - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - if audio_in.startswith('http') or os.path.isfile(audio_in): - self.audio_in, self.raw_inputs = generate_scp_from_url( - audio_in) - else: - raise FileNotFoundError( - f'file {audio_in} NOT FOUND, please CHECK!') - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.preprocessor.forward(self.model_cfg, self.recog_type, - self.audio_format, self.audio_in, - self.audio_fs, self.cmd) - output = self.forward(output, **kwargs) - rst = self.postprocess(output) - return rst - - def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: - if self.preprocessor is None: - self.preprocessor = WavToScp() - - outputs = self.preprocessor.config_checking(self.model_cfg) - # generate asr inference command - cmd = { - 'maxlenratio': 0.0, - 'minlenratio': 0.0, - 'batch_size': 1, - 'beam_size': 1, - 'ngpu': 1, - 'ctc_weight': 0.0, - 'lm_weight': 0.0, - 'penalty': 0.0, - 'log_level': 'ERROR', - 'asr_train_config': None, - 'asr_model_file': outputs['am_model_path'], - 'cmvn_file': None, - 
'lm_train_config': None, - 'lm_file': None, - 'token_type': None, - 'key_file': None, - 'word_lm_train_config': None, - 'bpemodel': None, - 'allow_variable_data_keys': False, - 'output_dir': None, - 'dtype': 'float32', - 'seed': 0, - 'ngram_weight': 0.9, - 'nbest': 1, - 'num_workers': 0, - 'vad_infer_config': None, - 'vad_model_file': None, - 'vad_cmvn_file': None, - 'time_stamp_writer': True, - 'punc_infer_config': None, - 'punc_model_file': None, - 'timestamp_infer_config': None, - 'timestamp_model_file': None, - 'timestamp_cmvn_file': None, - 'outputs_dict': True, - 'param_dict': None, - 'model_type': outputs['model_type'], - 'idx_text': '', - 'sampled_ids': 'seq2seq/sampled_ids', - 'sampled_lengths': 'seq2seq/sampled_lengths', - 'model_lang': outputs['model_lang'], - 'code_base': outputs['code_base'], - 'mode': outputs['mode'], - 'fs': { - 'model_fs': None, - 'audio_fs': None - }, - 'fake_streaming': False, - } - - frontend_conf = None - token_num_relax = None - decoding_ind = None - decoding_mode = None - fake_streaming = False - if os.path.exists(outputs['am_model_config']): - config_file = open(outputs['am_model_config'], encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - if os.path.exists(outputs['asr_model_config']): - config_file = open(outputs['asr_model_config'], encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'token_num_relax' in root: - token_num_relax = root['token_num_relax'] - if 'decoding_ind' in root: - decoding_ind = root['decoding_ind'] - if 'decoding_mode' in root: - decoding_mode = root['decoding_mode'] - - cmd['beam_size'] = root['beam_size'] - cmd['penalty'] = root['penalty'] - cmd['maxlenratio'] = root['maxlenratio'] - cmd['minlenratio'] = root['minlenratio'] - cmd['ctc_weight'] = root['ctc_weight'] - cmd['lm_weight'] = root['lm_weight'] - cmd['asr_train_config'] = outputs['am_model_config'] - cmd['lm_file'] = outputs['lm_model_path'] - cmd['lm_train_config'] = outputs['lm_model_config'] - cmd['batch_size'] = outputs['model_config']['batch_size'] - cmd['frontend_conf'] = frontend_conf - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - cmd['token_num_relax'] = token_num_relax - cmd['decoding_ind'] = decoding_ind - cmd['decoding_mode'] = decoding_mode - cmd['fake_streaming'] = fake_streaming - if outputs.__contains__('mvn_file'): - cmd['cmvn_file'] = outputs['mvn_file'] - model_config = self.model_cfg['model_config'] - if model_config.__contains__('vad_model') and self.vad_model is None: - self.vad_model = model_config['vad_model'] - if model_config.__contains__('vad_model_revision'): - self.vad_model_revision = model_config['vad_model_revision'] - if model_config.__contains__('punc_model') and self.punc_model is None: - self.punc_model = model_config['punc_model'] - if model_config.__contains__('punc_model_revision'): - self.punc_model_revision = model_config['punc_model_revision'] - if model_config.__contains__( - 'timestamp_model') and self.timestamp_model is None: - self.timestamp_model = model_config['timestamp_model'] - if model_config.__contains__('timestamp_model_revision'): - self.timestamp_model_revision = model_config[ - 'timestamp_model_revision'] - update_local_model(model_config, model_path, extra_args) - self.load_vad_model(cmd) - self.load_punc_model(cmd) - self.load_lm_model(cmd) - self.load_timestamp_model(cmd) - - user_args_dict = [ - 'output_dir', - 'batch_size', 
- 'mode', - 'ngpu', - 'beam_size', - 'ctc_weight', - 'lm_weight', - 'decoding_ind', - 'decoding_mode', - 'vad_model_file', - 'vad_infer_config', - 'vad_cmvn_file', - 'punc_model_file', - 'punc_infer_config', - 'param_dict', - 'fake_streaming', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def load_vad_model(self, cmd): - if self.vad_model is not None and self.vad_model != '': - if os.path.exists(self.vad_model): - vad_model = self.vad_model - else: - vad_model = snapshot_download( - self.vad_model, revision=self.vad_model_revision) - logger.info('loading vad model from {0} ...'.format(vad_model)) - config_path = os.path.join(vad_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['vad_model_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['vad_model_name']) - cmd['vad_infer_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['vad_model_config']) - cmd['vad_cmvn_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_mvn_file']) - if 'vad' not in cmd['mode']: - cmd['mode'] = cmd['mode'] + '_vad' - - def load_punc_model(self, cmd): - if self.punc_model is not None and self.punc_model != '': - if os.path.exists(self.punc_model): - punc_model = self.punc_model - else: - punc_model = snapshot_download( - self.punc_model, revision=self.punc_model_revision) - logger.info( - 'loading punctuation model from {0} ...'.format(punc_model)) - config_path = os.path.join(punc_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['punc_model_file'] = os.path.join( - model_dir, model_cfg['model']['punc_model_name']) - cmd['punc_infer_config'] = os.path.join( - model_dir, - model_cfg['model']['punc_model_config']['punc_config']) - if 'punc' not in cmd['mode']: - cmd['mode'] = cmd['mode'] + '_punc' - - def load_lm_model(self, cmd): - if self.lm_model is not None and self.lm_model != '': - if os.path.exists(self.lm_model): - lm_model = self.lm_model - else: - lm_model = snapshot_download( - self.lm_model, revision=self.lm_model_revision) - logger.info('loading language model from {0} ...'.format(lm_model)) - config_path = os.path.join(lm_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['lm_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['lm_model_name']) - cmd['lm_train_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['lm_model_config']) - - # FIXME - def load_timestamp_model(self, cmd): - if self.timestamp_model is not None and self.timestamp_model != '': - if os.path.exists(self.timestamp_model): - timestamp_model = self.timestamp_model - else: - timestamp_model = snapshot_download( - self.timestamp_model, - revision=self.timestamp_model_revision) - logger.info( - 'loading timestamp model from {0} ...'.format(timestamp_model)) - config_path = os.path.join(timestamp_model, - ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['timestamp_model_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_model_file']) - cmd['timestamp_infer_config'] = os.path.join( - model_dir, - 
model_cfg['model']['model_config']['timestamp_infer_config']) - cmd['timestamp_cmvn_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_cmvn_file']) - - def forward(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: - """Decoding - """ - - logger.info(f"Decoding with {inputs['audio_format']} files ...") - - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [self.audio_in, 'speech', 'bytes'] - elif isinstance(self.audio_in, str): - data_cmd = [self.audio_in, 'speech', 'sound'] - elif self.raw_inputs is not None: - data_cmd = None - - # generate asr inference command - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - inputs['asr_result'] = self.run_inference(self.cmd, **kwargs) - - return inputs - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """process the asr results - """ - from funasr.utils import asr_utils - - logger.info('Computing the result of ASR ...') - - rst = {} - - # single wav or pcm task - if inputs['recog_type'] == 'wav': - if 'asr_result' in inputs and len(inputs['asr_result']) > 0: - for key, value in inputs['asr_result'][0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - - # run with datasets, and audio format is waveform or kaldi_ark or tfrecord - elif inputs['recog_type'] != 'wav': - inputs['reference_list'] = self.ref_list_tidy(inputs) - - inputs['datasets_result'] = asr_utils.compute_wer( - hyp_list=inputs['asr_result'], - ref_list=inputs['reference_list']) - - else: - raise ValueError('recog_type and audio_format are mismatching') - - if 'datasets_result' in inputs: - rst[OutputKeys.TEXT] = inputs['datasets_result'] - - return rst - - def ref_list_tidy(self, inputs: Dict[str, Any]) -> List[Any]: - ref_list = [] - - if inputs['audio_format'] == 'tfrecord': - # should assemble idx + txt - with open(inputs['reference_text'], 'r', encoding='utf-8') as r: - text_lines = r.readlines() - - with open(inputs['idx_text'], 'r', encoding='utf-8') as i: - idx_lines = i.readlines() - - j: int = 0 - while j < min(len(text_lines), len(idx_lines)): - idx_str = idx_lines[j].strip() - text_str = text_lines[j].strip().replace(' ', '') - item = {'key': idx_str, 'value': text_str} - ref_list.append(item) - j += 1 - - else: - # text contain idx + sentence - with open(inputs['reference_text'], 'r', encoding='utf-8') as f: - lines = f.readlines() - - for line in lines: - line_item = line.split(None, 1) - if len(line_item) > 1: - item = { - 'key': line_item[0], - 'value': line_item[1].strip('\n') - } - ref_list.append(item) - - return ref_list - - def run_inference(self, cmd, **kwargs): - asr_result = self.funasr_infer_modelscope(cmd['name_and_type'], - cmd['raw_inputs'], - cmd['output_dir'], cmd['fs'], - cmd['param_dict'], **kwargs) - - return asr_result diff --git a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py index 9e0eb7f5c..f80dbf4cd 100644 --- a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py @@ -35,7 +35,7 @@ def __call__(self, audio_fs: int = None, recog_type: str = None, audio_format: str = None) -> Dict[str, Any]: - from funasr.utils import asr_utils + # from funasr.utils import asr_utils self.recog_type = recog_type self.audio_format = audio_format @@ -54,17 +54,17 @@ def 
__call__(self, if checking_audio_fs is not None: self.audio_fs = checking_audio_fs - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) + # if recog_type is None or audio_format is None: + # self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( + # audio_in=self.audio_in, + # recog_type=recog_type, + # audio_format=audio_format) - if hasattr(asr_utils, 'sample_rate_checking'): - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs + # if hasattr(asr_utils, 'sample_rate_checking'): + # checking_audio_fs = asr_utils.sample_rate_checking( + # self.audio_in, self.audio_format) + # if checking_audio_fs is not None: + # self.audio_fs = checking_audio_fs inputs = { 'audio': self.audio_in, diff --git a/modelscope/pipelines/audio/speaker_verification_pipeline.py b/modelscope/pipelines/audio/audio_quantization_pipeline.py similarity index 52% rename from modelscope/pipelines/audio/speaker_verification_pipeline.py rename to modelscope/pipelines/audio/audio_quantization_pipeline.py index c23058be4..76115db5f 100644 --- a/modelscope/pipelines/audio/speaker_verification_pipeline.py +++ b/modelscope/pipelines/audio/audio_quantization_pipeline.py @@ -3,6 +3,7 @@ import shutil from typing import Any, Dict, List, Sequence, Tuple, Union +import numpy as np import yaml from modelscope.metainfo import Pipelines @@ -10,34 +11,36 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_for_sv, - generate_sv_scp_from_url, +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, update_local_model) from modelscope.utils.constant import Frameworks, Tasks from modelscope.utils.logger import get_logger logger = get_logger() -__all__ = ['SpeakerVerificationPipeline'] +__all__ = ['AudioQuantizationPipeline'] @PIPELINES.register_module( - Tasks.speaker_verification, module_name=Pipelines.sv_inference) -class SpeakerVerificationPipeline(Pipeline): - """Speaker Verification Inference Pipeline - use `model` to create a Speaker Verification pipeline. + Tasks.audio_quantization, + module_name=Pipelines.audio_quantization_inference) +class AudioQuantizationPipeline(Pipeline): + """Audio Quantization Inference Pipeline + use `model` to create a audio quantization pipeline. Args: - model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + model (AudioQuantizationPipeline): A model instance, or a model local dir, or a model id in the model hub. kwargs (dict, `optional`): Extra kwargs passed into the preprocessor's constructor. 
Examples: >>> from modelscope.pipelines import pipeline - >>> pipeline_sv = pipeline( - >>> task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch') - >>> audio_in=('sv_example_enroll.wav', 'sv_example_same.wav') - >>> print(pipeline_sv(audio_in)) - >>> # {'label': ['Same', 'Different'], 'scores': [0.8540488358969999, 0.14595116410300013]} + >>> from modelscope.utils.constant import Tasks + >>> pipeline_aq = pipeline( + >>> task=Tasks.audio_quantization, + >>> model='damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch' + >>> ) + >>> audio_in='example.wav' + >>> print(pipeline_aq(audio_in)) """ @@ -51,8 +54,8 @@ def __init__(self, self.model_cfg = self.model.forward() self.cmd = self.get_cmd(kwargs, model) - from funasr.bin import sv_inference_launch - self.funasr_infer_modelscope = sv_inference_launch.inference_launch( + from funcodec.bin import codec_inference + self.funasr_infer_modelscope = codec_inference.inference_modelscope( mode=self.cmd['mode'], output_dir=self.cmd['output_dir'], batch_size=self.cmd['batch_size'], @@ -62,13 +65,14 @@ def __init__(self, num_workers=self.cmd['num_workers'], log_level=self.cmd['log_level'], key_file=self.cmd['key_file'], - sv_train_config=self.cmd['sv_train_config'], - sv_model_file=self.cmd['sv_model_file'], + config_file=self.cmd['config_file'], + model_file=self.cmd['model_file'], model_tag=self.cmd['model_tag'], allow_variable_data_keys=self.cmd['allow_variable_data_keys'], streaming=self.cmd['streaming'], - embedding_node=self.cmd['embedding_node'], - sv_threshold=self.cmd['sv_threshold'], + sampling_rate=self.cmd['sampling_rate'], + bit_width=self.cmd['bit_width'], + use_scale=self.cmd['use_scale'], param_dict=self.cmd['param_dict'], **kwargs, ) @@ -78,7 +82,7 @@ def __call__(self, output_dir: str = None, param_dict: dict = None) -> Dict[str, Any]: if len(audio_in) == 0: - raise ValueError('The input of sv should not be null.') + raise ValueError('The input should not be null.') else: self.audio_in = audio_in if output_dir is not None: @@ -94,19 +98,11 @@ def postprocess(self, inputs: list) -> Dict[str, Any]: """ rst = {} for i in range(len(inputs)): - # for single input, re-formate the output - # audio_in: - # list/tuple: return speaker verification scores - # single wav/bytes: return speaker embedding if len(inputs) == 1 and i == 0: - if isinstance(self.audio_in, tuple) or isinstance( - self.audio_in, list): - score = inputs[0]['value'] - rst[OutputKeys.LABEL] = ['Same', 'Different'] - rst[OutputKeys.SCORES] = [score / 100.0, 1 - score / 100.0] - else: - embedding = inputs[0]['value'] - rst[OutputKeys.SPK_EMBEDDING] = embedding + recon_wav = inputs[0]['value'] + output_wav = recon_wav.cpu().numpy()[0] + output_wav = (output_wav * (2**15)).astype(np.int16) + rst[OutputKeys.OUTPUT_WAV] = output_wav else: # for multiple inputs rst[inputs[i]['key']] = inputs[i]['value'] @@ -115,10 +111,12 @@ def postprocess(self, inputs: list) -> Dict[str, Any]: def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: # generate asr inference command mode = self.model_cfg['model_config']['mode'] - sv_model_path = self.model_cfg['model_path'] - sv_model_config = os.path.join( + _model_path = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['model_file']) + _model_config = os.path.join( self.model_cfg['model_workspace'], - self.model_cfg['model_config']['sv_model_config']) + self.model_cfg['model_config']['config_file']) update_local_model(self.model_cfg['model_config'], 
model_path, extra_args) cmd = { @@ -131,25 +129,27 @@ def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: 'num_workers': 0, 'log_level': 'ERROR', 'key_file': None, - 'sv_model_file': sv_model_path, - 'sv_train_config': sv_model_config, + 'model_file': _model_path, + 'config_file': _model_config, 'model_tag': None, 'allow_variable_data_keys': True, 'streaming': False, - 'embedding_node': 'resnet1_dense', - 'sv_threshold': 0.9465, + 'sampling_rate': 16000, + 'bit_width': 8000, + 'use_scale': True, 'param_dict': None, } user_args_dict = [ 'output_dir', 'batch_size', 'ngpu', - 'embedding_node', - 'sv_threshold', 'log_level', 'allow_variable_data_keys', 'streaming', 'num_workers', + 'sampling_rate', + 'bit_width', + 'use_scale', 'param_dict', ] @@ -181,69 +181,34 @@ def forward(self, audio_in: Union[tuple, str, Any] = None) -> list: """Decoding """ # log file_path/url or tuple (str, str) - if isinstance(audio_in, str) or \ - (isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)): - logger.info(f'Speaker Verification Processing: {audio_in} ...') + if isinstance(audio_in, str): + logger.info(f'Audio Quantization Processing: {audio_in} ...') else: logger.info( - f'Speaker Verification Processing: {str(audio_in)[:100]} ...') + f'Audio Quantization Processing: {str(audio_in)[:100]} ...') data_cmd, raw_inputs = None, None - if isinstance(audio_in, tuple) or isinstance(audio_in, list): - # generate audio_scp - assert len(audio_in) == 2 - if isinstance(audio_in[0], str): - # for scp inputs - if len(audio_in[0].split(',')) == 3 and audio_in[0].split( - ',')[0].endswith('.scp'): - if len(audio_in[1].split(',')) == 3 and audio_in[1].split( - ',')[0].endswith('.scp'): - data_cmd = [ - tuple(audio_in[0].split(',')), - tuple(audio_in[1].split(',')) - ] - # for single-file inputs - else: - audio_scp_1, audio_scp_2 = generate_sv_scp_from_url( - audio_in) - if isinstance(audio_scp_1, bytes) and isinstance( - audio_scp_2, bytes): - data_cmd = [(audio_scp_1, 'speech', 'bytes'), - (audio_scp_2, 'ref_speech', 'bytes')] - else: - data_cmd = [(audio_scp_1, 'speech', 'sound'), - (audio_scp_2, 'ref_speech', 'sound')] - # for raw bytes inputs - elif isinstance(audio_in[0], bytes): - data_cmd = [(audio_in[0], 'speech', 'bytes'), - (audio_in[1], 'ref_speech', 'bytes')] + if isinstance(audio_in, str): + # for scp inputs + if len(audio_in.split(',')) == 3: + data_cmd = [tuple(audio_in.split(','))] + # for single-file inputs else: - raise TypeError('Unsupported data type.') + audio_scp, _ = generate_scp_from_url(audio_in) + raw_inputs = audio_scp + # for raw bytes + elif isinstance(audio_in, bytes): + data_cmd = (audio_in, 'speech', 'bytes') + # for ndarray and tensor inputs else: - if isinstance(audio_in, str): - # for scp inputs - if len(audio_in.split(',')) == 3: - data_cmd = [audio_in.split(',')] - # for single-file inputs - else: - audio_scp = generate_scp_for_sv(audio_in) - if isinstance(audio_scp, bytes): - data_cmd = [(audio_scp, 'speech', 'bytes')] - else: - data_cmd = [(audio_scp, 'speech', 'sound')] - # for raw bytes - elif isinstance(audio_in, bytes): - data_cmd = [(audio_in, 'speech', 'bytes')] - # for ndarray and tensor inputs + import torch + import numpy as np + if isinstance(audio_in, torch.Tensor): + raw_inputs = audio_in + elif isinstance(audio_in, np.ndarray): + raw_inputs = audio_in else: - import torch - import numpy as np - if isinstance(audio_in, torch.Tensor): - raw_inputs = audio_in - elif isinstance(audio_in, np.ndarray): - raw_inputs = audio_in - else: - raise 
TypeError('Unsupported data type.') + raise TypeError('Unsupported data type.') self.cmd['name_and_type'] = data_cmd self.cmd['raw_inputs'] = raw_inputs diff --git a/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py b/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py new file mode 100644 index 000000000..52de7d799 --- /dev/null +++ b/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py @@ -0,0 +1,276 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict, Optional, Union + +import json +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, + update_local_model) +from modelscope.utils.constant import Frameworks, ModelFile, Tasks +from modelscope.utils.hub import snapshot_download +from modelscope.utils.logger import get_logger + +__all__ = ['LauraCodecTTSPipeline'] + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.text_to_speech, module_name=Pipelines.laura_codec_tts_inference) +class LauraCodecTTSPipeline(Pipeline): + """Laura-style Codec-based TTS Inference Pipeline + use `model` to create a TTS pipeline. + + Args: + model (LauraCodecTTSPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + Examples: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> my_pipeline = pipeline( + >>> task=Tasks.text_to_speech, + >>> model='damo/speech_synthesizer-laura-en-libritts-16k-codec_nq2-pytorch' + >>> ) + >>> text='nothing was to be done but to put about, and return in disappointment towards the north.' 
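Hedged usage sketch for the audio-quantization pipeline above. The model id and input file come from the class docstring; writing the result assumes 16 kHz int16 PCM, which matches postprocess() and the model name.

import soundfile as sf

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

aq = pipeline(
    task=Tasks.audio_quantization,
    model='damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch')
result = aq('example.wav')  # also accepts a URL, raw bytes, an ndarray or a tensor
sf.write('recon.wav', result[OutputKeys.OUTPUT_WAV], 16000)  # int16 samples from postprocess()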
+ >>> prompt_text='one of these is context' + >>> prompt_speech='example/prompt.wav' + >>> print(my_pipeline(text)) + + """ + + def __init__(self, + model: Union[Model, str] = None, + codec_model: Optional[Union[Model, str]] = None, + codec_model_revision: Optional[str] = None, + ngpu: int = 1, + **kwargs): + """use `model` to create an asr pipeline for prediction + """ + super().__init__(model=model, **kwargs) + self.model_cfg = self.model.forward() + self.codec_model = codec_model + self.codec_model_revision = codec_model_revision + self.cmd = self.get_cmd(kwargs, model) + + from funcodec.bin import text2audio_inference + self.funasr_infer_modelscope = text2audio_inference.inference_func( + mode=self.cmd['mode'], + output_dir=self.cmd['output_dir'], + batch_size=self.cmd['batch_size'], + dtype=self.cmd['dtype'], + ngpu=ngpu, + seed=self.cmd['seed'], + num_workers=self.cmd['num_workers'], + log_level=self.cmd['log_level'], + key_file=self.cmd['key_file'], + config_file=self.cmd['config_file'], + model_file=self.cmd['model_file'], + model_tag=self.cmd['model_tag'], + allow_variable_data_keys=self.cmd['allow_variable_data_keys'], + streaming=self.cmd['streaming'], + text_emb_model=self.cmd['text_emb_model'], + beam_size=self.cmd['beam_size'], + sampling=self.cmd['sampling'], + continual=self.cmd['continual'], + tokenize_to_phone=self.cmd['tokenize_to_phone'], + exclude_prompt=self.cmd['exclude_prompt'], + codec_config_file=self.cmd['codec_config_file'], + codec_model_file=self.cmd['codec_model_file'], + param_dict=self.cmd['param_dict']) + + def __call__(self, + text: Union[tuple, str, Any] = None, + prompt_text: Union[tuple, str, Any] = None, + prompt_audio: Union[tuple, str, Any] = None, + output_dir: str = None, + param_dict: dict = None) -> Dict[str, Any]: + if len(text) == 0: + raise ValueError('The input should not be null.') + if output_dir is not None: + self.cmd['output_dir'] = output_dir + self.cmd['param_dict'] = param_dict + + output = self.forward(text, prompt_text, prompt_audio) + result = self.postprocess(output) + return result + + def postprocess(self, inputs: list) -> Dict[str, Any]: + """Postprocessing + """ + rst = {} + for i in range(len(inputs)): + if len(inputs) == 1 and i == 0: + recon_wav = inputs[0]['value']['gen'] + rst[OutputKeys.OUTPUT_WAV] = recon_wav.cpu().numpy()[0] + else: + # for multiple inputs + rst[inputs[i]['key']] = inputs[i]['value']['gen'] + return rst + + def load_codec_model(self, cmd): + if self.codec_model is not None and self.codec_model != '': + if os.path.exists(self.codec_model): + codec_model = self.codec_model + else: + codec_model = snapshot_download( + self.codec_model, revision=self.codec_model_revision) + logger.info('loading codec model from {0} ...'.format(codec_model)) + config_path = os.path.join(codec_model, ModelFile.CONFIGURATION) + model_cfg = json.loads(open(config_path).read()) + model_dir = os.path.dirname(config_path) + cmd['codec_model_file'] = os.path.join( + model_dir, model_cfg['model']['model_config']['model_file']) + cmd['codec_config_file'] = os.path.join( + model_dir, model_cfg['model']['model_config']['config_file']) + + def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: + # generate asr inference command + mode = self.model_cfg['model_config']['mode'] + _model_path = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['model_file']) + _model_config = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['config_file']) + 
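A sketch of the codec-model resolution performed by load_codec_model() above: accept either a local directory or a hub id, then read the checkpoint and config file names from the model's configuration.json. The helper name is illustrative; the key layout follows the code above.

import json
import os

from modelscope.utils.constant import ModelFile
from modelscope.utils.hub import snapshot_download


def resolve_codec(codec_model: str, revision: str = None):
    model_dir = codec_model if os.path.exists(codec_model) else snapshot_download(
        codec_model, revision=revision)
    with open(os.path.join(model_dir, ModelFile.CONFIGURATION), encoding='utf-8') as f:
        cfg = json.load(f)
    model_config = cfg['model']['model_config']
    return (os.path.join(model_dir, model_config['model_file']),
            os.path.join(model_dir, model_config['config_file']))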
update_local_model(self.model_cfg['model_config'], model_path, + extra_args) + + cmd = { + 'mode': mode, + 'output_dir': None, + 'batch_size': 1, + 'dtype': 'float32', + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'seed': 0, + 'num_workers': 0, + 'log_level': 'ERROR', + 'key_file': None, + 'model_file': _model_path, + 'config_file': _model_config, + 'model_tag': None, + 'allow_variable_data_keys': True, + 'streaming': False, + 'beam_size': 1, + 'sampling': 25, + 'text_emb_model': None, + 'continual': True, + 'tokenize_to_phone': True, + 'exclude_prompt': True, + 'codec_model_file': None, + 'codec_config_file': None, + 'param_dict': None, + } + user_args_dict = [ + 'output_dir', + 'batch_size', + 'ngpu', + 'log_level', + 'allow_variable_data_keys', + 'streaming', + 'num_workers', + 'sampling_rate', + 'bit_width', + 'use_scale', + 'param_dict', + ] + + model_config = self.model_cfg['model_config'] + if model_config.__contains__( + 'codec_model') and self.codec_model is None: + self.codec_model = model_config['codec_model'] + if model_config.__contains__( + 'codec_model_revision') and self.codec_model_revision is None: + self.codec_model_revision = model_config['codec_model_revision'] + self.load_codec_model(cmd) + + # re-write the config with configure.json + for user_args in user_args_dict: + if (user_args in self.model_cfg['model_config'] + and self.model_cfg['model_config'][user_args] is not None): + if isinstance(cmd[user_args], dict) and isinstance( + self.model_cfg['model_config'][user_args], dict): + cmd[user_args].update( + self.model_cfg['model_config'][user_args]) + else: + cmd[user_args] = self.model_cfg['model_config'][user_args] + + # rewrite the config with user args + for user_args in user_args_dict: + if user_args in extra_args: + if extra_args.get(user_args) is not None: + if isinstance(cmd[user_args], dict) and isinstance( + extra_args[user_args], dict): + cmd[user_args].update(extra_args[user_args]) + else: + cmd[user_args] = extra_args[user_args] + del extra_args[user_args] + + return cmd + + def forward(self, + text: Union[tuple, str, Any] = None, + prompt_text: Union[tuple, str, Any] = None, + prompt_audio: Union[tuple, str, Any] = None, + **forward_params) -> list: + """Decoding + """ + if isinstance(text, str): + logger.info(f'Generate speech for: {text} ...') + + data_cmd, raw_inputs = None, None + # process text input + # for scp inputs + if len(text.split(',')) == 3: + data_cmd = [tuple(text.split(','))] + # for single-file inputs + else: + raw_inputs = [text] + + if prompt_text is not None and prompt_audio is not None: + if len(prompt_text.split(',')) == 3: + data_cmd.append(tuple(prompt_text.split(','))) + else: + raw_inputs.append(prompt_text) + + if isinstance(prompt_audio, str): + if len(prompt_audio.split(',')) == 3: + data_cmd.append(tuple(prompt_audio.split(','))) + else: + audio_path, _ = generate_scp_from_url(prompt_audio) + raw_inputs.append(audio_path) + # for ndarray and tensor inputs + else: + import torch + if isinstance(prompt_audio, torch.Tensor): + raw_inputs.append(prompt_audio.numpy()) + elif isinstance(prompt_audio, np.ndarray): + raw_inputs.append(prompt_audio) + else: + raise TypeError( + f'Unsupported prompt audio type {type(prompt_audio)}.') + + self.cmd['name_and_type'] = data_cmd + self.cmd['raw_inputs'] = raw_inputs + result = self.run_inference(self.cmd) + + return result + + def run_inference(self, cmd): + if self.framework == Frameworks.torch: + sv_result = self.funasr_infer_modelscope( + 
data_path_and_name_and_type=cmd['name_and_type'], + raw_inputs=cmd['raw_inputs'], + output_dir_v2=cmd['output_dir'], + param_dict=cmd['param_dict']) + else: + raise ValueError('model type is mismatching') + + return sv_result diff --git a/modelscope/pipelines/audio/funasr_pipeline.py b/modelscope/pipelines/audio/funasr_pipeline.py new file mode 100644 index 000000000..4b66b6ab2 --- /dev/null +++ b/modelscope/pipelines/audio/funasr_pipeline.py @@ -0,0 +1,75 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict, List, Sequence, Tuple, Union + +import json +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, + update_local_model) +from modelscope.utils.constant import Frameworks, ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['FunASRPipeline'] + + +@PIPELINES.register_module( + Tasks.auto_speech_recognition, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.voice_activity_detection, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.language_score_prediction, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.punctuation, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speaker_diarization, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speaker_verification, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speech_separation, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speech_timestamp, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.emotion_recognition, module_name=Pipelines.funasr_pipeline) +class FunASRPipeline(Pipeline): + """Voice Activity Detection Inference Pipeline + use `model` to create a Voice Activity Detection pipeline. + + Args: + model: A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + + Example: + >>> from modelscope.pipelines import pipeline + >>> p = pipeline( + >>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch') + >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm' + >>> print(p(audio_in)) + + """ + + def __init__(self, model: Union[Model, str] = None, **kwargs): + """use `model` to create an vad pipeline for prediction + """ + super().__init__(model=model, **kwargs) + + def __call__(self, *args, **kwargs) -> Dict[str, Any]: + """ + Decoding the input audios + Args: + input('str' or 'bytes'): + Return: + a list of dictionary of result. 
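Hedged usage sketch for the Laura codec TTS pipeline above, showing a speaker prompt actually being passed (the class docstring defines a prompt but does not pass it to the call). The model id, text and prompt file come from that docstring; the 16 kHz write rate is assumed from the model name.

import soundfile as sf

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tts = pipeline(
    task=Tasks.text_to_speech,
    model='damo/speech_synthesizer-laura-en-libritts-16k-codec_nq2-pytorch')
result = tts(
    'nothing was to be done but to put about, and return in disappointment towards the north.',
    prompt_text='one of these is context',
    prompt_audio='example/prompt.wav')
sf.write('gen.wav', result[OutputKeys.OUTPUT_WAV], 16000)  # generated waveform from postprocess()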
+ """ + + output = self.model(*args, **kwargs) + + return output diff --git a/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py b/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py index 1b9c7f799..0865bdfef 100644 --- a/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py +++ b/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py @@ -55,24 +55,34 @@ def __call__(self, in_audios: Union[str, list, np.ndarray], out_file: str = None): wavs = self.preprocess(in_audios) - results = self.forward(wavs) - outputs = self.postprocess(results, in_audios, out_file) + scores, results = self.forward(wavs) + outputs = self.postprocess(results, scores, in_audios, out_file) return outputs def forward(self, inputs: list): + scores = [] results = [] for x in inputs: - results.append(self.model(x).item()) - return results + score, result = self.model(x) + scores.append(score.tolist()) + results.append(result.item()) + return scores, results def postprocess(self, inputs: list, + scores: list, in_audios: Union[str, list, np.ndarray], out_file=None): if isinstance(in_audios, str): - output = {OutputKeys.TEXT: self.languages[inputs[0]]} + output = { + OutputKeys.TEXT: self.languages[inputs[0]], + OutputKeys.SCORE: scores + } else: - output = {OutputKeys.TEXT: [self.languages[i] for i in inputs]} + output = { + OutputKeys.TEXT: [self.languages[i] for i in inputs], + OutputKeys.SCORE: scores + } if out_file is not None: out_lines = [] for i, audio in enumerate(in_audios): diff --git a/modelscope/pipelines/audio/language_recognition_pipeline.py b/modelscope/pipelines/audio/language_recognition_pipeline.py index 00adcfff4..353232d7b 100644 --- a/modelscope/pipelines/audio/language_recognition_pipeline.py +++ b/modelscope/pipelines/audio/language_recognition_pipeline.py @@ -55,24 +55,34 @@ def __call__(self, in_audios: Union[str, list, np.ndarray], out_file: str = None): wavs = self.preprocess(in_audios) - results = self.forward(wavs) - outputs = self.postprocess(results, in_audios, out_file) + scores, results = self.forward(wavs) + outputs = self.postprocess(results, scores, in_audios, out_file) return outputs def forward(self, inputs: list): + scores = [] results = [] for x in inputs: - results.append(self.model(x).item()) - return results + score, result = self.model(x) + scores.append(score.tolist()) + results.append(result.item()) + return scores, results def postprocess(self, inputs: list, + scores: list, in_audios: Union[str, list, np.ndarray], out_file=None): if isinstance(in_audios, str): - output = {OutputKeys.TEXT: self.languages[inputs[0]]} + output = { + OutputKeys.TEXT: self.languages[inputs[0]], + OutputKeys.SCORE: scores + } else: - output = {OutputKeys.TEXT: [self.languages[i] for i in inputs]} + output = { + OutputKeys.TEXT: [self.languages[i] for i in inputs], + OutputKeys.SCORE: scores + } if out_file is not None: out_lines = [] for i, audio in enumerate(in_audios): diff --git a/modelscope/pipelines/audio/lm_infer_pipeline.py b/modelscope/pipelines/audio/lm_infer_pipeline.py deleted file mode 100644 index e1524ebd3..000000000 --- a/modelscope/pipelines/audio/lm_infer_pipeline.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -from typing import Any, Dict, Union - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_text_from_url, - update_local_model) -from modelscope.utils.config import Config -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['LanguageModelPipeline'] - - -@PIPELINES.register_module( - Tasks.language_score_prediction, module_name=Pipelines.lm_inference) -class LanguageModelPipeline(Pipeline): - """Language Model Inference Pipeline - - Example: - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> inference_pipeline = pipeline( - >>> task=Tasks.language_score_prediction, - >>> model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch') - >>> text_in='hello 大 家 好 呀' - >>> print(inference_pipeline(text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` to create a LM pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - model_file('str'): - LM model file - train_config('str'): - LM infer configuration - num_workers('int'): - the number of workers used for DataLoader - log_level('str'): - log level - log_base('float', defaults to 10.0): - the base of logarithm for Perplexity - split_with_space('bool'): - split the input sentence by space - seg_dict_file('str'): - seg dict file - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import lm_inference_launch - self.funasr_infer_modelscope = lm_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - train_config=self.cmd['train_config'], - model_file=self.cmd['model_file'], - log_base=self.cmd['log_base'], - split_with_space=self.cmd['split_with_space'], - seg_dict_file=self.cmd['seg_dict_file'], - output_dir=self.cmd['output_dir'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - text_in: str = None, - output_dir: str = None, - param_dict: dict = None) -> Dict[str, Any]: - """ - Compute PPL - Args: - text_in('str'): - - A text str input - - A local text file input endswith .txt or .scp - - A url text file input - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The PPL result. 
- """ - if len(text_in) == 0: - raise ValueError('The input of lm should not be null.') - else: - self.text_in = text_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - output = self.forward(self.text_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - text = inputs[0]['value'] - if len(text) > 0: - rst[OutputKeys.TEXT] = text - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - # generate inference command - model_cfg = Config.from_file(config_path) - model_dir = os.path.dirname(config_path) - mode = model_cfg.model['model_config']['mode'] - lm_model_path = os.path.join( - model_dir, model_cfg.model['model_config']['lm_model_name']) - lm_model_config = os.path.join( - model_dir, model_cfg.model['model_config']['lm_model_config']) - seg_dict_file = None - if 'seg_dict_file' in model_cfg.model['model_config']: - seg_dict_file = os.path.join( - model_dir, model_cfg.model['model_config']['seg_dict_file']) - update_local_model(model_cfg.model['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'train_config': lm_model_config, - 'model_file': lm_model_path, - 'log_base': 10.0, - 'allow_variable_data_keys': False, - 'split_with_space': True, - 'seg_dict_file': seg_dict_file, - 'output_dir': None, - 'param_dict': None, - } - - user_args_dict = [ - 'batch_size', - 'ngpu', - 'num_workers', - 'log_level', - 'train_config', - 'model_file', - 'log_base', - 'split_with_space', - 'seg_dict_file', - 'output_dir', - 'param_dict', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, text_in: str = None) -> list: - """Decoding - """ - logger.info('Compute PPL : {0} ...'.format(text_in)) - # generate text_in - text_file, raw_inputs = generate_text_from_url(text_in) - data_cmd = None - if raw_inputs is None: - data_cmd = [(text_file, 'text', 'text')] - elif text_file is None and raw_inputs is not None: - data_cmd = None - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - lm_result = self.run_inference(self.cmd) - - return lm_result - - def run_inference(self, cmd): - if self.framework == Frameworks.torch: - lm_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - param_dict=cmd['param_dict']) - else: - raise ValueError('model type is mismatching') - - return lm_result diff --git a/modelscope/pipelines/audio/punctuation_processing_pipeline.py b/modelscope/pipelines/audio/punctuation_processing_pipeline.py deleted file mode 100644 index 4e41e0c09..000000000 --- a/modelscope/pipelines/audio/punctuation_processing_pipeline.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -import shutil -from typing import Any, Dict, List, Sequence, Tuple, Union - -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_text_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['PunctuationProcessingPipeline'] - - -@PIPELINES.register_module( - Tasks.punctuation, module_name=Pipelines.punc_inference) -class PunctuationProcessingPipeline(Pipeline): - """Punctuation Processing Inference Pipeline - use `model` to create a Punctuation Processing pipeline. - - Args: - model (PunctuationProcessingPipeline): A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. - Examples - >>> from modelscope.pipelines import pipeline - >>> pipeline_punc = pipeline( - >>> task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch') - >>> text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt' - >>> print(pipeline_punc(text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create an asr pipeline for prediction - """ - super().__init__(model=model, **kwargs) - self.model_cfg = self.model.forward() - self.cmd = self.get_cmd(kwargs, model) - - from funasr.bin import punc_inference_launch - self.funasr_infer_modelscope = punc_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - train_config=self.cmd['train_config'], - model_file=self.cmd['model_file'], - output_dir=self.cmd['output_dir'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - text_in: str = None, - output_dir: str = None, - cache: List[Any] = None, - param_dict: dict = None) -> Dict[str, Any]: - if len(text_in) == 0: - raise ValueError('The input of punctuation should not be null.') - else: - self.text_in = text_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if cache is not None: - self.cmd['cache'] = cache - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - output = self.forward(self.text_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - for key, value in inputs[0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: - # generate inference command - lang = self.model_cfg['model_config']['lang'] - punc_model_path = self.model_cfg['punc_model_path'] - punc_model_config = os.path.join( - self.model_cfg['model_workspace'], - self.model_cfg['model_config']['punc_config']) - mode = self.model_cfg['model_config']['mode'] - update_local_model(self.model_cfg['model_config'], model_path, - extra_args) - cmd = { - 
'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'train_config': punc_model_config, - 'model_file': punc_model_path, - 'output_dir': None, - 'lang': lang, - 'cache': None, - 'param_dict': None, - } - - user_args_dict = [ - 'batch_size', - 'dtype', - 'ngpu', - 'seed', - 'num_workers', - 'log_level', - 'train_config', - 'model_file', - 'output_dir', - 'lang', - 'param_dict', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, text_in: str = None) -> list: - """Decoding - """ - logger.info('Punctuation Processing: {0} ...'.format(text_in)) - # generate text_in - text_file, raw_inputs = generate_text_from_url(text_in) - if raw_inputs is None: - data_cmd = [(text_file, 'text', 'text')] - elif text_file is None and raw_inputs is not None: - data_cmd = None - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - punc_result = self.run_inference(self.cmd) - - return punc_result - - def run_inference(self, cmd): - punc_result = '' - if self.framework == Frameworks.torch: - punc_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - cache=cmd['cache'], - param_dict=cmd['param_dict']) - else: - raise ValueError('model type is mismatching') - - return punc_result diff --git a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py index e4810bcfe..9f6961e2d 100644 --- a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py +++ b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py @@ -179,16 +179,17 @@ def preprocess(self, audio: Union[str, np.ndarray, list]) -> list: if not hasattr(self, 'vad_pipeline'): self.vad_pipeline = pipeline( task=Tasks.voice_activity_detection, - model=self.config['vad_model']) - vad_time = self.vad_pipeline(audio, audio_fs=self.fs) + model=self.config['vad_model'], + model_revision='v2.0.2') + vad_time = self.vad_pipeline( + audio, fs=self.fs, is_final=True)[0]['value'] vad_segments = [] - if isinstance(vad_time['text'], str): - vad_time_list = ast.literal_eval(vad_time['text']) - elif isinstance(vad_time['text'], list): - vad_time_list = vad_time['text'] + if isinstance(vad_time, str): + vad_time_list = ast.literal_eval(vad_time) + elif isinstance(vad_time, list): + vad_time_list = vad_time else: - raise ValueError('Incorrect vad result. Get %s' % - (type(vad_time['text']))) + raise ValueError('Incorrect vad result. Get %s' % (type(vad_time))) for t in vad_time_list: st = int(t[0]) / 1000 ed = int(t[1]) / 1000 diff --git a/modelscope/pipelines/audio/speaker_diarization_pipeline.py b/modelscope/pipelines/audio/speaker_diarization_pipeline.py deleted file mode 100644 index dfb808d04..000000000 --- a/modelscope/pipelines/audio/speaker_diarization_pipeline.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
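# A minimal sketch of the updated VAD-result handling in SegmentationClusteringPipeline.preprocess
# (see the segmentation_clustering_pipeline.py hunk above): the VAD pipeline's [0]['value'] may be
# returned as a list or as its string form, and segment boundaries are converted from milliseconds
# to seconds. The timings below are illustrative assumptions, not values from this diff.
import ast

vad_value = '[[0, 1250], [1800, 4100]]'  # string form of a possible VAD output
vad_time_list = ast.literal_eval(vad_value) if isinstance(vad_value, str) else vad_value
segments = [(int(t[0]) / 1000, int(t[1]) / 1000) for t in vad_time_list]
print(segments)  # [(0.0, 1.25), (1.8, 4.1)] -> (start, end) in seconds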
-import os -import shutil -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -import json -import numpy -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_for_sv, - generate_sd_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.hub import snapshot_download -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['SpeakerDiarizationPipeline'] - - -@PIPELINES.register_module( - Tasks.speaker_diarization, - module_name=Pipelines.speaker_diarization_inference) -class SpeakerDiarizationPipeline(Pipeline): - """Speaker Diarization Inference Pipeline - use `model` to create a Speaker Diarization pipeline. - - Args: - model (SpeakerDiarizationPipeline): A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. - Examples: - >>> from modelscope.pipelines import pipeline - >>> pipeline_sd = pipeline( - >>> task=Tasks.speaker_diarization, model='damo/xxxxxxxxxxxxx') - >>> audio_in=('','','','') - >>> print(pipeline_sd(audio_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - sv_model: Optional[Union[Model, str]] = None, - sv_model_revision: Optional[str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create a speaker diarization pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - sv_model (Optional: 'Model' or 'str'): - speaker verification model from model hub or local - example: 'damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch' - sv_model_revision (Optional: 'str'): - speaker verfication model revision from model hub - """ - super().__init__(model=model, **kwargs) - self.model_cfg = None - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.sv_model = sv_model - self.sv_model_revision = sv_model_revision - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import diar_inference_launch - self.funasr_infer_modelscope = diar_inference_launch.inference_launch( - mode=self.cmd['mode'], - output_dir=self.cmd['output_dir'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - diar_train_config=self.cmd['diar_train_config'], - diar_model_file=self.cmd['diar_model_file'], - model_tag=self.cmd['model_tag'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - streaming=self.cmd['streaming'], - smooth_size=self.cmd['smooth_size'], - dur_threshold=self.cmd['dur_threshold'], - out_format=self.cmd['out_format'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[tuple, str, Any] = None, - output_dir: str = None, - param_dict: dict = None) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - output_dir('str'): - output dir - param_dict('dict'): 
- extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The speaker diarization result. - """ - if len(audio_in) == 0: - raise ValueError('The input of sv should not be null.') - else: - self.audio_in = audio_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - self.cmd['param_dict'] = param_dict - - output = self.forward(self.audio_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - # for demo service - if i == 0 and len(inputs) == 1: - rst[OutputKeys.TEXT] = inputs[0]['value'] - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - self.model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate sd inference command - mode = self.model_cfg['model']['model_config']['mode'] - diar_model_path = os.path.join( - model_dir, - self.model_cfg['model']['model_config']['diar_model_name']) - diar_model_config = os.path.join( - model_dir, - self.model_cfg['model']['model_config']['diar_model_config']) - update_local_model(self.model_cfg['model']['model_config'], model_path, - extra_args) - cmd = { - 'mode': mode, - 'output_dir': None, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'diar_model_file': diar_model_path, - 'diar_train_config': diar_model_config, - 'model_tag': None, - 'allow_variable_data_keys': True, - 'streaming': False, - 'smooth_size': 83, - 'dur_threshold': 10, - 'out_format': 'vad', - 'param_dict': { - 'sv_model_file': None, - 'sv_train_config': None - }, - } - user_args_dict = [ - 'mode', - 'output_dir', - 'batch_size', - 'ngpu', - 'log_level', - 'allow_variable_data_keys', - 'streaming', - 'num_workers', - 'smooth_size', - 'dur_threshold', - 'out_format', - 'param_dict', - ] - model_config = self.model_cfg['model']['model_config'] - if model_config.__contains__('sv_model') and self.sv_model != '': - self.sv_model = model_config['sv_model'] - if model_config.__contains__('sv_model_revision'): - self.sv_model_revision = model_config['sv_model_revision'] - self.load_sv_model(cmd) - - # rewrite the config with user args - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - if isinstance(cmd[user_args], dict) and isinstance( - extra_args[user_args], dict): - cmd[user_args].update(extra_args[user_args]) - else: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def load_sv_model(self, cmd): - if self.sv_model is not None and self.sv_model != '': - if os.path.exists(self.sv_model): - sv_model = self.sv_model - else: - sv_model = snapshot_download( - self.sv_model, revision=self.sv_model_revision) - logger.info( - 'loading speaker verification model from {0} ...'.format( - sv_model)) - config_path = os.path.join(sv_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['param_dict']['sv_model_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['sv_model_name']) - cmd['param_dict']['sv_train_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['sv_model_config']) 
- - def forward(self, audio_in: Union[tuple, str, Any] = None) -> list: - """Decoding - """ - # log file_path/url or tuple (str, str) - if isinstance(audio_in, str) or \ - (isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)): - logger.info(f'Speaker Verification Processing: {audio_in} ...') - else: - logger.info( - f'Speaker Verification Processing: {str(audio_in)[:100]} ...') - - data_cmd, raw_inputs = None, None - if isinstance(audio_in, tuple) or isinstance(audio_in, list): - # generate audio_scp - if isinstance(audio_in[0], str): - # for scp inputs - if len(audio_in[0].split(',')) == 3 and audio_in[0].split( - ',')[0].endswith('.scp'): - data_cmd = [] - for audio_cmd in audio_in: - if len(audio_cmd.split(',')) == 3 and audio_cmd.split( - ',')[0].endswith('.scp'): - data_cmd.append(tuple(audio_cmd.split(','))) - # for audio-list inputs - else: - raw_inputs = generate_sd_scp_from_url(audio_in) - # for raw bytes inputs - elif isinstance(audio_in[0], (bytes, numpy.ndarray)): - raw_inputs = audio_in - else: - raise TypeError( - 'Unsupported data type, it must be data_name_type_path, ' - 'file_path, url, bytes or numpy.ndarray') - else: - raise TypeError( - 'audio_in must be a list of data_name_type_path, file_path, ' - 'url, bytes or numpy.ndarray') - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - result = self.run_inference(self.cmd) - - return result - - def run_inference(self, cmd): - if self.framework == Frameworks.torch: - diar_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - param_dict=cmd['param_dict']) - else: - raise ValueError( - 'framework is mismatching, which should be pytorch') - - return diar_result diff --git a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py index ba28ed6e2..507e761df 100644 --- a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py +++ b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import io +import os from typing import Any, Dict, List, Union import numpy as np diff --git a/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py b/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py new file mode 100644 index 000000000..edac14446 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_eres2netv2) +class ERes2NetV2_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. 
+ kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' 
+ ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py b/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py new file mode 100644 index 000000000..308190601 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_res2net) +class Res2Net_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='iic/speech_res2net_sv_zh-cn_3dspeaker_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' 
+ % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' + ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py b/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py new file mode 100644 index 000000000..8b2b59dba --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
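# A minimal usage sketch for the speaker-verification pipelines added in this diff
# (the ERes2NetV2, Res2Net, ResNet and TDNN pipelines share the same __call__ signature).
# The model id is taken from the ResNet pipeline docstring below; the wav paths and the
# threshold value are assumptions for illustration only.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

sv = pipeline(
    task=Tasks.speaker_verification,
    model='iic/speech_resnet34_sv_zh-cn_3dspeaker_16k')
# Two utterances -> dict with OutputKeys.SCORE (cosine similarity) and OutputKeys.TEXT ('yes'/'no')
print(sv(['speaker1_a.wav', 'speaker1_b.wav'], thr=0.6))
# output_emb=True additionally returns the raw embeddings under 'embs'
embs = sv(['speaker1_a.wav', 'speaker1_b.wav'], output_emb=True)['embs']
print(embs.shape)  # (2, embedding_dim)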
+ +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_resnet) +class ResNet_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='iic/speech_resnet34_sv_zh-cn_3dspeaker_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' 
+ % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' + ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py b/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py new file mode 100644 index 000000000..352d448ba --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py @@ -0,0 +1,110 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +from typing import Any, Dict, List, Union + +import soundfile as sf +import torch + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_sdpn) +class SDPNPipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + + def __call__(self, + in_audios: List[str], + thr: float = None) -> Dict[str, Any]: + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' 
+ % self.thr) + outputs = self.preprocess(in_audios) + outputs = self.forward(outputs) + outputs = self.postprocess(outputs) + + return outputs + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + emb1 = self.model(inputs['data1']) + emb2 = self.model(inputs['data2']) + + return {'emb1': emb1, 'emb2': emb2} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + score = self.compute_cos_similarity(inputs['emb1'], inputs['emb2']) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + + return {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + + def preprocess(self, inputs: List[str], + **preprocess_params) -> Dict[str, Any]: + if len(inputs) != 2: + raise ValueError( + 'modelscope error: Two input audio files are required.') + output = {} + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + if fs != self.model_config['sample_rate']: + raise ValueError( + 'modelscope error: Only support %d sample rate files' + % self.model_config['sample_rate']) + output['data%d' % + (i + 1)] = torch.from_numpy(data).unsqueeze(0) + else: + raise ValueError( + 'modelscope error: The input type of input %d is temporarily restricted to audio file address.' + % i) + return output + + def compute_cos_similarity(self, emb1: torch.Tensor, + emb2: torch.Tensor) -> float: + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py b/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py new file mode 100644 index 000000000..4c8a6f321 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_tdnn) +class SpeakerVerificationTDNNPipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor.
+ Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' 
+ ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index 4cfa9379e..17ce054f3 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -1,16 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np from modelscope.metainfo import Pipelines -from modelscope.models import Model from modelscope.models.audio.tts import SambertHifigan from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, InputModel, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.constant import Tasks __all__ = ['TextToSpeechSambertHifiganPipeline'] diff --git a/modelscope/pipelines/audio/timestamp_pipeline.py b/modelscope/pipelines/audio/timestamp_pipeline.py deleted file mode 100644 index 98e9eb05f..000000000 --- a/modelscope/pipelines/audio/timestamp_pipeline.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Sequence, Tuple, Union - -import json -import yaml -from funasr.utils import asr_utils - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['TimestampPipeline'] - - -@PIPELINES.register_module( - Tasks.speech_timestamp, module_name=Pipelines.speech_timestamp_inference) -class TimestampPipeline(Pipeline): - """Timestamp Inference Pipeline - Example: - - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> pipeline_infer = pipeline( - >>> task=Tasks.speech_timestamp, - >>> model='damo/speech_timestamp_predictor-v1-16k-offline') - - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav' - >>> text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢' - >>> print(pipeline_infer(audio_in, text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` and `preprocessor` to create an asr pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - split_with_space('bool'): - split the input sentence by space - 
seg_dict_file('str'): - seg dict file - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import tp_inference_launch - self.funasr_infer_modelscope = tp_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - timestamp_infer_config=self.cmd['timestamp_infer_config'], - timestamp_model_file=self.cmd['timestamp_model_file'], - timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'], - output_dir=self.cmd['output_dir'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - split_with_space=self.cmd['split_with_space'], - seg_dict_file=self.cmd['seg_dict_file'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - text_in: str, - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - text_in('str'): - - A text str input - - A local text file input endswith .txt or .scp - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type for wav file or datasets file ('wav', 'test', 'dev', 'train') - audio_format('str'): - audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord') - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The timestamp result. 
- """ - self.audio_in = None - self.text_in = None - self.raw_inputs = None - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - # audio - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in) - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - # text - if text_in.startswith('http'): - self.text_in, _ = generate_text_from_url(text_in) - else: - self.text_in = text_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.forward(self.audio_in, self.text_in, **kwargs) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - for key, value in inputs[0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate inference command - timestamp_model_file = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_model_file']) - timestamp_infer_config = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_infer_config']) - timestamp_cmvn_file = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_cmvn_file']) - mode = model_cfg['model']['model_config']['mode'] - frontend_conf = None - if os.path.exists(timestamp_infer_config): - config_file = open(timestamp_infer_config, encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - seg_dict_file = None - if 'seg_dict_file' in model_cfg['model']['model_config']: - seg_dict_file = os.path.join( - model_dir, model_cfg['model']['model_config']['seg_dict_file']) - update_local_model(model_cfg['model']['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 0, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'allow_variable_data_keys': False, - 'split_with_space': True, - 
'seg_dict_file': seg_dict_file, - 'timestamp_infer_config': timestamp_infer_config, - 'timestamp_model_file': timestamp_model_file, - 'timestamp_cmvn_file': timestamp_cmvn_file, - 'output_dir': None, - 'param_dict': None, - 'fs': { - 'model_fs': None, - 'audio_fs': None - } - } - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - - user_args_dict = [ - 'output_dir', - 'batch_size', - 'mode', - 'ngpu', - 'param_dict', - 'num_workers', - 'log_level', - 'split_with_space', - 'seg_dict_file', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, audio_in: Dict[str, Any], text_in: Dict[str, Any], - **kwargs) -> Dict[str, Any]: - """Decoding - """ - logger.info('Timestamp Processing ...') - # generate inputs - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [(self.audio_in, 'speech', 'bytes')] - data_cmd.append((text_in, 'text', 'text')) - elif isinstance(self.audio_in, str): - data_cmd = [(self.audio_in, 'speech', 'sound')] - data_cmd.append((text_in, 'text', 'text')) - elif self.raw_inputs is not None: - data_cmd = None - - if self.raw_inputs is None and data_cmd is None: - raise ValueError('please check audio_in') - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - tp_result = self.run_inference(self.cmd, **kwargs) - - return tp_result - - def run_inference(self, cmd, **kwargs): - tp_result = [] - if self.framework == Frameworks.torch: - tp_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - fs=cmd['fs'], - param_dict=cmd['param_dict'], - **kwargs) - else: - raise ValueError('model type is mismatching') - - return tp_result diff --git a/modelscope/pipelines/audio/voice_activity_detection_pipeline.py b/modelscope/pipelines/audio/voice_activity_detection_pipeline.py deleted file mode 100644 index 3e00454a9..000000000 --- a/modelscope/pipelines/audio/voice_activity_detection_pipeline.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Sequence, Tuple, Union - -import json -import yaml -from funasr.utils import asr_utils - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['VoiceActivityDetectionPipeline'] - - -@PIPELINES.register_module( - Tasks.voice_activity_detection, module_name=Pipelines.vad_inference) -class VoiceActivityDetectionPipeline(Pipeline): - """Voice Activity Detection Inference Pipeline - use `model` to create a Voice Activity Detection pipeline. - - Args: - model: A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. 
- - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_vad = pipeline( - >>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch') - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm' - >>> print(pipeline_vad(audio_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create an vad pipeline for prediction - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import vad_inference_launch - self.funasr_infer_modelscope = vad_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - vad_infer_config=self.cmd['vad_infer_config'], - vad_model_file=self.cmd['vad_model_file'], - vad_cmvn_file=self.cmd['vad_cmvn_file'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type for wav file or datasets file ('wav', 'test', 'dev', 'train') - audio_format('str'): - audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord') - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The vad result. 
- """ - self.audio_in = None - self.raw_inputs = None - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in) - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.forward(self.audio_in, **kwargs) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - text = inputs[0]['value'] - if len(text) > 0: - rst[OutputKeys.TEXT] = text - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate inference command - vad_model_path = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_model_name']) - vad_model_config = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_model_config']) - vad_cmvn_file = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_mvn_file']) - mode = model_cfg['model']['model_config']['mode'] - frontend_conf = None - if os.path.exists(vad_model_config): - config_file = open(vad_model_config, encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - update_local_model(model_cfg['model']['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'vad_infer_config': vad_model_config, - 'vad_model_file': vad_model_path, - 'vad_cmvn_file': vad_cmvn_file, - 'output_dir': None, - 'param_dict': None, - 'fs': { - 'model_fs': None, - 'audio_fs': None - } - } - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - - user_args_dict = [ - 'output_dir', 'batch_size', 'mode', 'ngpu', 'param_dict', - 'num_workers', 'fs' - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = 
extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, audio_in: Dict[str, Any], **kwargs) -> Dict[str, Any]: - """Decoding - """ - logger.info('VAD Processing ...') - # generate inputs - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [self.audio_in, 'speech', 'bytes'] - elif isinstance(self.audio_in, str): - data_cmd = [self.audio_in, 'speech', 'sound'] - elif self.raw_inputs is not None: - data_cmd = None - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - vad_result = self.run_inference(self.cmd, **kwargs) - - return vad_result - - def run_inference(self, cmd, **kwargs): - vad_result = [] - if self.framework == Frameworks.torch: - vad_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - fs=cmd['fs'], - param_dict=cmd['param_dict'], - **kwargs) - else: - raise ValueError('model type is mismatching') - - return vad_result diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 4869e5c70..91c5b5543 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -44,7 +44,7 @@ class Pipeline(ABC): """Pipeline base. """ - def initiate_single_model(self, model): + def initiate_single_model(self, model, **kwargs): if isinstance(model, str): logger.info(f'initiate model from {model}') if isinstance(model, str) and is_official_hub_path(model): @@ -55,7 +55,8 @@ def initiate_single_model(self, model): device=self.device_name, model_prefetched=True, invoked_by=Invoke.PIPELINE, - device_map=self.device_map) if is_model(model) else model + device_map=self.device_map, + **kwargs) if is_model(model) else model else: return model @@ -96,7 +97,7 @@ def __init__(self, self.device_name = device if not isinstance(model, List): - self.model = self.initiate_single_model(model) + self.model = self.initiate_single_model(model, **kwargs) self.models = [self.model] else: self.model = None @@ -204,6 +205,13 @@ def __call__(self, input: Union[Input, List[Input]], *args, kwargs['preprocess_params'] = preprocess_params kwargs['forward_params'] = forward_params kwargs['postprocess_params'] = postprocess_params + + # for LLMPipeline, we shall support treating list of roles as a + # one single 'messages' input + if 'LLMPipeline' in type(self).__name__ and isinstance(input, list): + input = {'messages': input} + kwargs['is_message'] = True + if isinstance(input, list): if batch_size is None: output = [] @@ -396,7 +404,6 @@ def forward(self, inputs: Dict[str, Any], assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.' return self.model(inputs, **forward_params) - @abstractmethod def postprocess(self, inputs: Dict[str, Any], **post_params) -> Dict[str, Any]: """ If current pipeline support model reuse, common postprocess @@ -481,7 +488,10 @@ def __init__(self, def __del__(self): if hasattr(self, 'model_pool') and self.model_pool is not None: - self.model_pool.terminate() + try: + self.model_pool.terminate() + except AttributeError: + pass def __getstate__(self): self_dict = self.__dict__.copy() diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f44f73811..665318073 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -1,13 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
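# A minimal sketch of the list-of-messages handling added to Pipeline.__call__ earlier in this
# diff: when an LLMPipeline receives a plain list, it is wrapped as {'messages': <list>} and
# is_message is set. The model id is a placeholder and the role/content message layout is the
# common convention assumed here, not something specified by this diff.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

chat = pipeline(task=Tasks.chat, model='<chat-llm-model-id>')  # placeholder model id
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hello!'},
]
print(chat(messages))  # equivalent to chat({'messages': messages})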
import os -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import DEFAULT_MODEL_FOR_PIPELINE from modelscope.models.base import Model from modelscope.utils.config import ConfigDict, check_config -from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, Tasks, ThirdParty) from modelscope.utils.hub import read_config from modelscope.utils.plugins import (register_modelhub_repo, @@ -108,18 +108,29 @@ def pipeline(task: str = None, """ if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') - + prefer_llm_pipeline = kwargs.get('external_engine_for_llm') + if task is not None and task.lower() in [ + Tasks.text_generation, Tasks.chat + ]: + # if not specified, prefer llm pipeline for aforementioned tasks + if prefer_llm_pipeline is None: + prefer_llm_pipeline = True + # for llm pipeline, if llm_framework is not specified, default to swift instead + # TODO: port the swift infer based on transformer into ModelScope + if prefer_llm_pipeline and kwargs.get('llm_framework') is None: + kwargs['llm_framework'] = 'swift' third_party = kwargs.get(ThirdParty.KEY) if third_party is not None: kwargs.pop(ThirdParty.KEY) - model = normalize_model_input( - model, - model_revision, - third_party=third_party, - ignore_file_pattern=ignore_file_pattern) - if pipeline_name is None and kwargs.get('llm_first'): - pipeline_name = llm_first_checker(model, model_revision) - kwargs.pop('llm_first') + if pipeline_name is None and prefer_llm_pipeline: + pipeline_name = external_engine_for_llm_checker( + model, model_revision, kwargs) + else: + model = normalize_model_input( + model, + model_revision, + third_party=third_party, + ignore_file_pattern=ignore_file_pattern) pipeline_props = {'type': pipeline_name} if pipeline_name is None: # get default pipeline for this task @@ -131,10 +142,16 @@ def pipeline(task: str = None, model, revision=model_revision) if isinstance( model, str) else read_config( model[0], revision=model_revision) - check_config(cfg) register_plugins_repo(cfg.safe_get('plugins')) register_modelhub_repo(model, cfg.get('allow_remote', False)) - pipeline_props = cfg.pipeline + pipeline_name = external_engine_for_llm_checker( + model, model_revision, + kwargs) if prefer_llm_pipeline else None + if pipeline_name is not None: + pipeline_props = {'type': pipeline_name} + else: + check_config(cfg) + pipeline_props = cfg.pipeline elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model @@ -153,6 +170,7 @@ def pipeline(task: str = None, pipeline_props['device'] = device cfg = ConfigDict(pipeline_props) + clear_llm_info(kwargs) if kwargs: cfg.update(kwargs) @@ -201,15 +219,27 @@ def get_default_pipeline_info(task): return pipeline_name, default_model -def llm_first_checker(model: Union[str, List[str], Model, List[Model]], - revision: Optional[str]) -> Optional[str]: - from .nlp.llm_pipeline import ModelTypeHelper, LLM_FORMAT_MAP +def external_engine_for_llm_checker(model: Union[str, List[str], Model, + List[Model]], + revision: Optional[str], + kwargs: Dict[str, Any]) -> Optional[str]: + from .nlp.llm_pipeline import ModelTypeHelper, LLMAdapterRegistry if isinstance(model, list): model = model[0] if not isinstance(model, str): model = model.model_dir + + if kwargs.get('llm_framework') == 
'swift': + return 'llm' model_type = ModelTypeHelper.get( - model, revision, with_adapter=True, split='-') - if model_type in LLM_FORMAT_MAP: + model, revision, with_adapter=True, split='-', use_cache=True) + if LLMAdapterRegistry.contains(model_type): return 'llm' + + +def clear_llm_info(kwargs: Dict): + from modelscope.utils.model_type_helper import ModelTypeHelper + + kwargs.pop('external_engine_for_llm', None) + ModelTypeHelper.clear_cache() diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 6fcd77eac..530c86a97 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -41,6 +41,7 @@ from .image_super_resolution_pasd_pipeline import ImageSuperResolutionPASDPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline + from .image_inpainting_pipeline import ImageInpaintingPipeline from .image_paintbyexample_pipeline import ImagePaintbyexamplePipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline @@ -84,6 +85,7 @@ from .video_object_segmentation_pipeline import VideoObjectSegmentationPipeline from .video_deinterlace_pipeline import VideoDeinterlacePipeline from .image_matching_pipeline import ImageMatchingPipeline + from .image_matching_fast_pipeline import ImageMatchingFastPipeline from .video_stabilization_pipeline import VideoStabilizationPipeline from .video_super_resolution_pipeline import VideoSuperResolutionPipeline from .pointcloud_sceneflow_estimation_pipeline import PointCloudSceneFlowEstimationPipeline @@ -107,6 +109,7 @@ from .image_human_parsing_pipeline import ImageHumanParsingPipeline from .nerf_recon_acc_pipeline import NeRFReconAccPipeline from .nerf_recon_4k_pipeline import NeRFRecon4KPipeline + from .image_to_3d_pipeline import Image23DPipeline from .surface_recon_common_pipeline import SurfaceReconCommonPipeline from .controllable_image_generation_pipeline import ControllableImageGenerationPipeline from .image_bts_depth_estimation_pipeline import ImageBTSDepthEstimationPipeline @@ -115,6 +118,13 @@ from .text_to_360panorama_image_pipeline import Text2360PanoramaImagePipeline from .human3d_render_pipeline import Human3DRenderPipeline from .human3d_animation_pipeline import Human3DAnimationPipeline + from .image_local_feature_matching_pipeline import ImageLocalFeatureMatchingPipeline + from .rife_video_frame_interpolation_pipeline import RIFEVideoFrameInterpolationPipeline + from .anydoor_pipeline import AnydoorPipeline + from .image_depth_estimation_marigold_pipeline import ImageDepthEstimationMarigoldPipeline + from .self_supervised_depth_completion_pipeline import SelfSupervisedDepthCompletionPipeline + from .human_normal_estimation_pipeline import HumanNormalEstimationPipeline + else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -163,6 +173,7 @@ ['ProductRetrievalEmbeddingPipeline'], 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generate_pipeline': ['Image2ImageGenerationPipeline'], + 'image_to_3d_pipeline': ['Image23DPipeline'], 'image_inpainting_pipeline': ['ImageInpaintingPipeline'], 'image_paintbyexample_pipeline': ['ImagePaintbyexamplePipeline'], 'ocr_detection_pipeline': ['OCRDetectionPipeline'], @@ -228,6 +239,7 @@ ], 'video_deinterlace_pipeline': ['VideoDeinterlacePipeline'], 'image_matching_pipeline': ['ImageMatchingPipeline'], + 'image_matching_fast_pipeline': 
['ImageMatchingFastPipeline'], 'video_stabilization_pipeline': ['VideoStabilizationPipeline'], 'video_super_resolution_pipeline': ['VideoSuperResolutionPipeline'], 'pointcloud_sceneflow_estimation_pipeline': [ @@ -269,6 +281,7 @@ 'image_human_parsing_pipeline': ['ImageHumanParsingPipeline'], 'nerf_recon_acc_pipeline': ['NeRFReconAccPipeline'], 'nerf_recon_4k_pipeline': ['NeRFRecon4KPipeline'], + 'nerf_recon_img_to_mv_pipeline': ['NeRFReconImgToMVPipeline'], 'surface_recon_common_pipeline': ['SurfaceReconCommonPipeline'], 'controllable_image_generation_pipeline': [ 'ControllableImageGenerationPipeline' @@ -287,6 +300,20 @@ ], 'human3d_render_pipeline': ['Human3DRenderPipeline'], 'human3d_animation_pipeline': ['Human3DAnimationPipeline'], + 'image_local_feature_matching_pipeline': [ + 'ImageLocalFeatureMatchingPipeline' + ], + 'rife_video_frame_interpolation_pipeline': [ + 'RIFEVideoFrameInterpolationPipeline' + ], + 'anydoor_pipeline': ['AnydoorPipeline'], + 'image_depth_estimation_marigold_pipeline': [ + 'ImageDepthEstimationMarigoldPipeline' + ], + 'self_supervised_depth_completion_pipeline': [ + 'SelfSupervisedDepthCompletionPipeline' + ], + 'human_normal_estimation_pipeline': ['HumanNormalEstimationPipeline'], } import sys diff --git a/modelscope/pipelines/cv/anydoor_pipeline.py b/modelscope/pipelines/cv/anydoor_pipeline.py new file mode 100644 index 000000000..397cd21d7 --- /dev/null +++ b/modelscope/pipelines/cv/anydoor_pipeline.py @@ -0,0 +1,290 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +import cv2 +import einops +import numpy as np +import requests +import torch +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.anydoor.cldm.ddim_hacked import DDIMSampler +from modelscope.models.cv.anydoor.datasets.data_utils import ( + box2squre, box_in_box, expand_bbox, expand_image_mask, get_bbox_from_mask, + pad_to_square, sobel) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_to_image_generation, module_name=Pipelines.anydoor) +class AnydoorPipeline(Pipeline): + r""" AnyDoor Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from PIL import Image + + >>> ref_image = 'data/test/images/image_anydoor_fg.png' + >>> ref_mask = 'data/test/images/image_anydoor_fg_mask.png' + >>> bg_image = 'data/test/images/image_anydoor_bg.png' + >>> bg_mask = 'data/test/images/image_anydoor_bg_mask.png' + + >>> anydoor_pipeline = pipeline(Tasks.image_to_image_generation, model='damo/AnyDoor') + >>> out = anydoor_pipeline((ref_image, ref_mask, bg_image, bg_mask)) + >>> assert isinstance(out['output_img'], Image.Image) + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a action detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + model_ckpt = os.path.join(self.model.model_dir, + self.cfg.model.model_path) + self.model.load_state_dict( + self._get_state_dict(model_ckpt, location='cuda')) + self.ddim_sampler = DDIMSampler(self.model) + + @staticmethod + def _get_state_dict(ckpt_path, location='cpu'): + + def get_state_dict(d): + return d.get('state_dict', d) + + _, extension = os.path.splitext(ckpt_path) + if extension.lower() == '.safetensors': + import safetensors.torch + state_dict = safetensors.torch.load_file( + ckpt_path, device=location) + else: + state_dict = get_state_dict( + torch.load(ckpt_path, map_location=torch.device(location))) + state_dict = get_state_dict(state_dict) + print(f'Loaded state_dict from [{ckpt_path}]') + return state_dict + + def preprocess(self, inputs: Input) -> Dict[str, Any]: + ref_image, ref_mask, tar_image, tar_mask = inputs + ref_image = np.asarray(load_image(ref_image).convert('RGB')) + ref_mask = np.where( + np.asarray(load_image(ref_mask).convert('L')) > 128, 1, + 0).astype(np.uint8) + tar_image = np.asarray(load_image(tar_image).convert('RGB')) + tar_mask = np.where( + np.asarray(load_image(tar_mask).convert('L')) > 128, 1, + 0).astype(np.uint8) + + # ========= Reference =========== + # ref expand + ref_box_yyxx = get_bbox_from_mask(ref_mask) + + # ref filter mask + ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1) + masked_ref_image = ref_image * ref_mask_3 + np.ones_like( + ref_image) * 255 * (1 - ref_mask_3) + + y1, y2, x1, x2 = ref_box_yyxx + masked_ref_image = masked_ref_image[y1:y2, x1:x2, :] + ref_mask = ref_mask[y1:y2, x1:x2] + + ratio = np.random.randint(11, 15) / 10 # 11,13 + masked_ref_image, ref_mask = expand_image_mask( + masked_ref_image, ref_mask, ratio=ratio) + ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1) + + # to square and resize + masked_ref_image = pad_to_square( + masked_ref_image, pad_value=255, random=False) + masked_ref_image = cv2.resize( + masked_ref_image.astype(np.uint8), (224, 224)).astype(np.uint8) + + ref_mask_3 = pad_to_square(ref_mask_3 * 255, pad_value=0, random=False) + ref_mask_3 = cv2.resize(ref_mask_3.astype(np.uint8), + (224, 224)).astype(np.uint8) + ref_mask = ref_mask_3[:, :, 0] + + # collage aug + masked_ref_image_compose, ref_mask_compose = masked_ref_image, ref_mask + ref_mask_3 = np.stack( + [ref_mask_compose, ref_mask_compose, ref_mask_compose], -1) + ref_image_collage = sobel(masked_ref_image_compose, + ref_mask_compose / 255) + + # ========= Target =========== + tar_box_yyxx = get_bbox_from_mask(tar_mask) + tar_box_yyxx = expand_bbox( + tar_mask, tar_box_yyxx, ratio=[1.1, 1.2]) # 1.1 1.3 + + # crop + tar_box_yyxx_crop = expand_bbox( + tar_image, tar_box_yyxx, ratio=[1.3, 3.0]) + tar_box_yyxx_crop = box2squre(tar_image, tar_box_yyxx_crop) # crop box + y1, y2, x1, x2 = tar_box_yyxx_crop + + cropped_target_image = tar_image[y1:y2, x1:x2, :] + cropped_tar_mask = tar_mask[y1:y2, x1:x2] + + tar_box_yyxx = box_in_box(tar_box_yyxx, tar_box_yyxx_crop) + y1, y2, x1, x2 = tar_box_yyxx + + # collage + ref_image_collage = cv2.resize( + ref_image_collage.astype(np.uint8), (x2 - x1, y2 - y1)) + ref_mask_compose = cv2.resize( + ref_mask_compose.astype(np.uint8), (x2 - x1, y2 - y1)) + ref_mask_compose = (ref_mask_compose > 128).astype(np.uint8) + + collage = cropped_target_image.copy() + collage[y1:y2, x1:x2, :] = ref_image_collage + + collage_mask = cropped_target_image.copy() * 0.0 + collage_mask[y1:y2, x1:x2, :] = 1.0 + collage_mask = np.stack( + [cropped_tar_mask, 
cropped_tar_mask, cropped_tar_mask], -1) + + # the size before pad + H1, W1 = collage.shape[0], collage.shape[1] + + cropped_target_image = pad_to_square( + cropped_target_image, pad_value=0, random=False).astype(np.uint8) + collage = pad_to_square( + collage, pad_value=0, random=False).astype(np.uint8) + collage_mask = pad_to_square( + collage_mask, pad_value=0, random=False).astype(np.uint8) + + # the size after pad + H2, W2 = collage.shape[0], collage.shape[1] + + cropped_target_image = cv2.resize( + cropped_target_image.astype(np.uint8), + (512, 512)).astype(np.float32) + collage = cv2.resize(collage.astype(np.uint8), + (512, 512)).astype(np.float32) + collage_mask = (cv2.resize(collage_mask.astype( + np.uint8), (512, 512)).astype(np.float32) > 0.5).astype(np.float32) + + masked_ref_image = masked_ref_image / 255 + cropped_target_image = cropped_target_image / 127.5 - 1.0 + collage = collage / 127.5 - 1.0 + collage = np.concatenate([collage, collage_mask[:, :, :1]], -1) + + item = dict( + tar_image=tar_image, + ref=masked_ref_image.copy(), + jpg=cropped_target_image.copy(), + hint=collage.copy(), + extra_sizes=np.array([H1, W1, H2, W2]), + tar_box_yyxx_crop=np.array(tar_box_yyxx_crop)) + return item + + def forward(self, + item: Dict[str, Any], + num_samples=1, + strength=1.0, + ddim_steps=30, + scale=3.0) -> Dict[str, Any]: + tar_image = item['tar_image'].cpu().numpy() + ref = item['ref'] + hint = item['hint'] + num_samples = 1 + + control = hint.float().cuda() + control = torch.stack([control for _ in range(num_samples)], dim=0) + control = einops.rearrange(control, 'b h w c -> b c h w').clone() + + clip_input = ref.float().cuda() + clip_input = torch.stack([clip_input for _ in range(num_samples)], + dim=0) + clip_input = einops.rearrange(clip_input, 'b h w c -> b c h w').clone() + + H, W = 512, 512 + + cond = { + 'c_concat': [control], + 'c_crossattn': [self.model.get_learned_conditioning(clip_input)] + } + un_cond = { + 'c_concat': [control], + 'c_crossattn': [ + self.model.get_learned_conditioning( + [torch.zeros((1, 3, 224, 224))] * num_samples) + ] + } + shape = (4, H // 8, W // 8) + + self.model.control_scales = ([strength] * 13) + samples, _ = self.ddim_sampler.sample( + ddim_steps, + num_samples, + shape, + cond, + verbose=False, + eta=0, + unconditional_guidance_scale=scale, + unconditional_conditioning=un_cond) + + x_samples = self.model.decode_first_stage(samples) + x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + + 127.5).cpu().numpy() + + result = x_samples[0][:, :, ::-1] + result = np.clip(result, 0, 255) + + pred = x_samples[0] + pred = np.clip(pred, 0, 255)[1:, :, :] + sizes = item['extra_sizes'].cpu().numpy() + tar_box_yyxx_crop = item['tar_box_yyxx_crop'].cpu().numpy() + return dict( + pred=pred, + tar_image=tar_image, + sizes=sizes, + tar_box_yyxx_crop=tar_box_yyxx_crop) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + pred = inputs['pred'] + tar_image = inputs['tar_image'] + extra_sizes = inputs['sizes'] + tar_box_yyxx_crop = inputs['tar_box_yyxx_crop'] + + H1, W1, H2, W2 = extra_sizes + y1, y2, x1, x2 = tar_box_yyxx_crop + pred = cv2.resize(pred, (W2, H2)) + m = 3 # maigin_pixel + + if W1 == H1: + tar_image[y1 + m:y2 - m, x1 + m:x2 - m, :] = pred[m:-m, m:-m] + gen_image = torch.from_numpy(tar_image.copy()).permute(2, 0, 1) + gen_image = gen_image.permute(1, 2, 0).numpy() + gen_image = Image.fromarray(gen_image, mode='RGB') + return {OutputKeys.OUTPUT_IMG: gen_image} + + if W1 < W2: + pad1 = int((W2 - W1) / 2) + pad2 = 
W2 - W1 - pad1 + pred = pred[:, pad1:-pad2, :] + else: + pad1 = int((H2 - H1) / 2) + pad2 = H2 - H1 - pad1 + pred = pred[pad1:-pad2, :, :] + + gen_image = tar_image.copy() + gen_image[y1 + m:y2 - m, x1 + m:x2 - m, :] = pred[m:-m, m:-m] + + gen_image = torch.from_numpy(gen_image).permute(2, 0, 1) + gen_image = gen_image.permute(1, 2, 0).numpy() + gen_image = Image.fromarray(gen_image, mode='RGB') + return {OutputKeys.OUTPUT_IMG: gen_image} diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index af1e08fe8..c9e5036a1 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -16,7 +16,7 @@ from matplotlib.ticker import MultipleLocator from modelscope.metainfo import Pipelines -from modelscope.models.cv.body_3d_keypoints.cannonical_pose.body_3d_pose import \ +from modelscope.models.cv.body_3d_keypoints.canonical_pose.body_3d_pose import \ KeypointsTypes from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline diff --git a/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py b/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py new file mode 100644 index 000000000..f734fd97c --- /dev/null +++ b/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py @@ -0,0 +1,147 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import InputPadder, flow_to_color +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.dense_optical_flow_estimation, + module_name=Pipelines.dense_optical_flow_estimation) +class DenseOpticalFlowEstimationPipeline(Pipeline): + r""" Dense Optical Flow Estimation Pipeline. 
+ + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline(Tasks.dense_optical_flow_estimation, model='Damo_XR_Lab/cv_raft_dense-optical-flow_things') + >>> estimator([[ + >>> 'modelscope/models/cv/dense_optical_flow_estimation/data/test/images/dense_flow1.png', + >>> 'modelscope/models/cv/dense_optical_flow_estimation/data/test/images/dense_flow2.png' + >>> ]]) + >>> [{'flows': tensor([[[[-1.6319, -1.6348, -1.6363, ..., -1.7191, -1.7136, -1.7085], + >>> [-1.6324, -1.6344, -1.6351, ..., -1.7110, -1.7048, -1.7005], + >>> [-1.6318, -1.6326, -1.6329, ..., -1.7080, -1.7050, -1.7031], + >>> ..., + >>> [-2.0998, -2.1007, -2.0958, ..., -1.4086, -1.4055, -1.3996], + >>> [-2.1043, -2.1031, -2.0988, ..., -1.4075, -1.4049, -1.3991], + >>> [-2.1016, -2.0985, -2.0939, ..., -1.4062, -1.4029, -1.3969]], + >>> + >>> [[ 0.0343, 0.0386, 0.0401, ..., 0.8053, 0.8050, 0.8057], + >>> [ 0.0311, 0.0354, 0.0369, ..., 0.8004, 0.8007, 0.8050], + >>> [ 0.0274, 0.0309, 0.0322, ..., 0.8007, 0.8016, 0.8080], + >>> ..., + >>> [ 0.5685, 0.5785, 0.5740, ..., 0.4003, 0.4153, 0.4365], + >>> [ 0.5994, 0.6000, 0.5899, ..., 0.4057, 0.4218, 0.4447], + >>> [ 0.6137, 0.6076, 0.5920, ..., 0.4147, 0.4299, 0.4538]]]], + >>> device='cuda:0'), 'flows_color': array([[[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> [[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> [[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> ..., + >>> + >>> [[251, 255, 207], + >>> [251, 255, 207], + >>> [251, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [251, 255, 222], + >>> [250, 255, 222]], + >>> + >>> [[250, 255, 207], + >>> [250, 255, 207], + >>> [250, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [250, 255, 222], + >>> [249, 255, 222]], + >>> + >>> [[249, 255, 207], + >>> [249, 255, 207], + >>> [250, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [250, 255, 222], + >>> [249, 255, 222]]], dtype=uint8)}] + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + + logger.info('dense optical flow estimation model, pipeline init') + + def load_image(self, img_name): + img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) + img = img.transpose(2, 0, 1) + + return img + + def preprocess(self, input: Input) -> Dict[str, Any]: + img1 = self.load_image(input[0]) + img2 = self.load_image(input[1]) + + image1 = torch.from_numpy(img1)[None].cuda().float() + image2 = torch.from_numpy(img2)[None].cuda().float() + + padder = InputPadder(image1.shape) + image1, image2 = padder.pad(image1, image2) + + data = {'image1': image1, 'image2': image2} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + flow_ups = self.model.inference(input) + results = flow_ups[-1] + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + out = self.model.postprocess(inputs) + flows_color = flow_to_color([out[OutputKeys.FLOWS]]) + flows_color = flows_color[:, :, [2, 1, 0]] + outputs = { + OutputKeys.FLOWS: out[OutputKeys.FLOWS], + OutputKeys.FLOWS_COLOR: flows_color + } + + return outputs diff --git a/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py b/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py new file mode 100644 index 000000000..5290af242 --- /dev/null +++ b/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py @@ -0,0 +1,88 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import argparse +import os +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.facial_68ldk_detection import infer +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.facial_68ldk_detection, module_name=Pipelines.facial_68ldk_detection) +class FaceLandmarkDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a facial 68-landmark detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + + parser = argparse.ArgumentParser(description='Evaluation script') + args = parser.parse_args() + args.config_name = 'alignment' + + device_ids = list() + if torch.cuda.is_available(): + device_ids = [0] + else: + device_ids = [-1] + + model_path = os.path.join(model, 'pytorch_model.pkl') + + self.fld = infer.Alignment( + args, model_path, dl_framework='pytorch', device_ids=device_ids) + + logger.info('Face 2d landmark detection model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + print('start preprocess') + + image = LoadImage.convert_to_ndarray(input) + image = cv2.resize(image, (256, 256)) + + data = {'image': image} + + print('finish preprocess') + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + print('start infer') + + image = input['image'] + + if torch.cuda.is_available(): + image_np = image.cpu().numpy() + else: + image_np = image.numpy() + + x1, y1, x2, y2 = 0, 0, 256, 256 + scale = max(x2 - x1, y2 - y1) / 180 + center_w = (x1 + x2) / 2 + center_h = (y1 + y2) / 2 + scale, center_w, center_h = float(scale), float(center_w), float( + center_h) + + results = self.fld.analyze(image_np, scale, center_w, center_h) + + print('finish infer') + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + outputs = {'landmarks': inputs} + return outputs diff --git a/modelscope/pipelines/cv/human3d_animation_pipeline.py b/modelscope/pipelines/cv/human3d_animation_pipeline.py index d03cd8a3e..4e5ab46db 100644 --- a/modelscope/pipelines/cv/human3d_animation_pipeline.py +++ b/modelscope/pipelines/cv/human3d_animation_pipeline.py @@ -72,7 +72,7 @@ def gen_weights(self, save_dir=None): (case_name, action_name)) exec_path = os.path.join(self.model_dir, 'skinning.py') - cmd = f'blender -b -P {exec_path} -- --input {self.case_dir}' \ + cmd = f'{self.blender} -b -P {exec_path} -- --input {self.case_dir}' \ f' --gltf_path {gltf_path} --action {self.action}' os.system(cmd) return gltf_path @@ -83,9 +83,6 @@ def animate(self, mesh_path, action_dir, action, save_dir=None): mesh = read_obj(mesh_path) tex = cv2.imread(tex_path) vertices = mesh['vertices'] - cent = (vertices.max(axis=0) + vertices.min(axis=0)) / 2 - new_cent = (0, 1.8 / 2, 0) - vertices -= (cent - new_cent) mesh['vertices'] = vertices mesh['texture_map'] = tex write_obj(mesh_path, mesh) @@ -108,6 +105,11 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: else: save_dir = None + if 'blender' in input: + self.blender = input['blender'] + else: + self.blender = 'blender' + if case_id.endswith('.obj'): mesh_path = case_id else: diff --git a/modelscope/pipelines/cv/human3d_render_pipeline.py b/modelscope/pipelines/cv/human3d_render_pipeline.py index 44d0bb21d..cf506d190 100644 --- a/modelscope/pipelines/cv/human3d_render_pipeline.py +++ b/modelscope/pipelines/cv/human3d_render_pipeline.py @@ -68,6 +68,8 @@ def load_3d_model(self, mesh_path): def format_nvdiffrast_format(self, mesh, tex): vert = mesh['vertices'] + cent = (vert.max(axis=0) + vert.min(axis=0)) / 2 + vert -= cent tri = mesh['faces'] tri = tri - 1 if tri.min() == 1 else tri vert_uv = mesh['uvs'] @@ -81,7 +83,7 @@ def format_nvdiffrast_format(self, mesh, tex): tex = torch.from_numpy(tex.astype(np.float32) / 255.0).cuda() return vtx_pos, pos_idx, vtx_uv, uv_idx, tex - def render_scene(self, mesh_path): + def render_scene(self, mesh_path, resolution=512): if not os.path.exists(mesh_path): logger.info('can not found %s, use default one' % 
mesh_path) mesh_path = os.path.join(self.model_dir, '3D-assets', @@ -99,8 +101,8 @@ def render_scene(self, mesh_path): frames_normals = [] for i in tqdm.tqdm(range(frame_length)): proj = projection(x=0.4, n=1.0, f=200.0) - a_rot = np.matmul(rotate_x(-0.1), rotate_y(ang)) - a_mv = np.matmul(translate(0, 0, -2.5), a_rot) + a_rot = np.matmul(rotate_x(0.0), rotate_y(ang)) + a_mv = np.matmul(translate(0, 0, -2.7), a_rot) r_mvp = np.matmul(proj, a_mv).astype(np.float32) pred_img, pred_mask, normal = render( glctx, @@ -110,7 +112,7 @@ def render_scene(self, mesh_path): vtx_uv, uv_idx, tex, - resolution=512, + resolution=resolution, enable_mip=False, max_mip_level=9) color = np.clip( @@ -123,7 +125,7 @@ def render_scene(self, mesh_path): frames_normals.append(normals) ang = ang + step - logger.info('load case %s done' + logger.info('render case %s done' % os.path.basename(os.path.dirname(mesh_path))) return mesh, frames_color, frames_normals @@ -131,6 +133,10 @@ def render_scene(self, mesh_path): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: dataset_id = input['dataset_id'] case_id = input['case_id'] + if 'resolution' in input: + resolution = input['resolution'] + else: + resolution = 512 if case_id.endswith('.obj'): mesh_path = case_id else: @@ -142,7 +148,7 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: case_dir = os.path.join(data_dir, case_id) mesh_path = os.path.join(case_dir, 'body.obj') - mesh, colors, normals = self.render_scene(mesh_path) + mesh, colors, normals = self.render_scene(mesh_path, resolution) results = { 'mesh': mesh, diff --git a/modelscope/pipelines/cv/human_normal_estimation_pipeline.py b/modelscope/pipelines/cv/human_normal_estimation_pipeline.py new file mode 100644 index 000000000..bd19b18da --- /dev/null +++ b/modelscope/pipelines/cv/human_normal_estimation_pipeline.py @@ -0,0 +1,95 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import numpy as np +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.human_normal_estimation, + module_name=Pipelines.human_normal_estimation) +class HumanNormalEstimationPipeline(Pipeline): + r""" Human Normal Estimation Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline( + >>> Tasks.human_normal_estimation, model='Damo_XR_Lab/cv_human_monocular-normal-estimation') + >>> estimator(f"{model_dir}/tests/image_normal_estimation.jpg") + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image normal estimation pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + logger.info('normal estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + """ + + Args: + input: string or ndarray or Image.Image + + Returns: + data: dict including inference inputs + """ + if isinstance(input, str): + img = np.array(Image.open(input)) + if isinstance(input, Image.Image): + img = np.array(input) + + img_h, img_w, img_ch = img.shape[0:3] + + if img_ch == 3: + msk = np.full((img_h, img_w, 1), 255, dtype=np.uint8) + img = np.concatenate((img, msk), axis=-1) + + H, W = 1024, 1024 + scale_factor = min(W / img_w, H / img_h) + img = Image.fromarray(img) + img = img.resize( + (int(img_w * scale_factor), int(img_h * scale_factor)), + Image.LANCZOS) + + new_img = Image.new('RGBA', (W, H), color=(0, 0, 0, 0)) + paste_pos_w = (W - img.width) // 2 + paste_pos_h = (H - img.height) // 2 + new_img.paste(img, (paste_pos_w, paste_pos_h)) + + bbox = (paste_pos_w, paste_pos_h, paste_pos_w + img.width, + paste_pos_h + img.height) + img = np.array(new_img) + + data = {'img': img[:, :, 0:3], 'msk': img[:, :, -1], 'bbox': bbox} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + normals = results[OutputKeys.NORMALS] + + normals_vis = (((normals + 1) * 0.5) * 255).astype(np.uint8) + normals_vis = normals_vis[..., [2, 1, 0]] + outputs = { + OutputKeys.NORMALS: normals, + OutputKeys.NORMALS_COLOR: normals_vis + } + return outputs diff --git a/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py b/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py new file mode 100644 index 000000000..e5cdd7e7c --- /dev/null +++ b/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py @@ -0,0 +1,409 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
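Before the next file, a small sketch of the geometry used by `HumanNormalEstimationPipeline.preprocess` above: the input is fitted into a fixed 1024x1024 canvas and the paste bbox is kept so the prediction can later be cropped back. The input size below is invented for illustration.

```python
# Illustrative numbers only (not from the patch): fit a 640x480 image into the
# 1024x1024 canvas used by HumanNormalEstimationPipeline.preprocess and keep
# the paste bbox for mapping the prediction back.
img_w, img_h = 640, 480
W, H = 1024, 1024
scale = min(W / img_w, H / img_h)                      # 1.6
new_w, new_h = int(img_w * scale), int(img_h * scale)  # 1024, 768
paste_w, paste_h = (W - new_w) // 2, (H - new_h) // 2  # 0, 128
bbox = (paste_w, paste_h, paste_w + new_w, paste_h + new_h)
print(bbox)                                            # (0, 128, 1024, 896)
```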
+ +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +from diffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, + UNet2DConditionModel) +from PIL import Image +from torch.utils.data import DataLoader, TensorDataset +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_depth_estimation_marigold import ( + MarigoldDepthOutput, chw2hwc, colorize_depth_maps, ensemble_depths, + find_batch_size, inter_distances, resize_max_res) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_depth_estimation, + module_name=Pipelines.image_depth_estimation_marigold) +class ImageDepthEstimationMarigoldPipeline(Pipeline): + + def __init__(self, model=str, **kwargs): + r""" + use `model` to create a image depth estimation pipeline for prediction + Args: + >>> model: modelscope model_id "Damo_XR_Lab/cv_marigold_monocular-depth-estimation" + + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + >>> + >>> output_image_path = './result.png' + >>> img = './test.jpg' + >>> + >>> pipe = pipeline( + >>> Tasks.image_depth_estimation, + >>> model='Damo_XR_Lab/cv_marigold_monocular-depth-estimation') + >>> + >>> depth_vis = pipe(input)[OutputKeys.DEPTHS_COLOR] + >>> depth_vis.save(output_image_path) + >>> print('pipeline: the output image path is {}'.format(output_image_path)) + + """ + super().__init__(model=model, **kwargs) + + self._device = getattr( + kwargs, 'device', + torch.device('cuda' if torch.cuda.is_available() else 'cpu')) + self._dtype = torch.float16 + logger.info('load depth estimation marigold pipeline done') + + self.checkpoint_path = os.path.join(model, 'Marigold_v1_merged_2') + self.pipeline = _MarigoldPipeline.from_pretrained( + self.checkpoint_path, torch_dtype=self._dtype) + self.pipeline.to(self._device) + + def preprocess(self, input: Input) -> Dict[str, Any]: + # print('pipeline preprocess') + # TODO: input type: Image + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + self.input_image = Image.open(input) + # print('load', input, self.input_image.size) + + results = self.pipeline(self.input_image) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + depths: np.ndarray = inputs.depth_np + depths_color: Image.Image = inputs.depth_colored + outputs = { + OutputKeys.DEPTHS: depths, + OutputKeys.DEPTHS_COLOR: depths_color + } + return outputs + + +class _MarigoldPipeline(DiffusionPipeline): + """ + Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io. + + This model inherits from [`DiffusionPipeline`]. + Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + unet (`UNet2DConditionModel`): + Conditional U-Net to denoise the depth latent, conditioned on image latent. + vae (`AutoencoderKL`): + Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps + to and from latent representations. 
+ scheduler (`DDIMScheduler`): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + text_encoder (`CLIPTextModel`): + Text-encoder, for empty text embedding. + tokenizer (`CLIPTokenizer`): + CLIP tokenizer. + """ + rgb_latent_scale_factor = 0.18215 + depth_latent_scale_factor = 0.18215 + + def __init__( + self, + unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: DDIMScheduler, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + ): + super().__init__() + + self.register_modules( + unet=unet, + vae=vae, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + ) + + self.empty_text_embed = None + + @torch.no_grad() + def __call__( + self, + input_image: Image, + denoising_steps: int = 10, + ensemble_size: int = 10, + processing_res: int = 768, + match_input_res: bool = True, + batch_size: int = 0, + color_map: str = 'Spectral', + show_progress_bar: bool = True, + ensemble_kwargs: Dict = None, + ) -> MarigoldDepthOutput: + r""" + Function invoked when calling the pipeline. + + Args: + input_image (`Image`): + Input RGB (or gray-scale) image. + processing_res (`int`, *optional*, defaults to `768`): + Maximum resolution of processing. + If set to 0: will not resize at all. + match_input_res (`bool`, *optional*, defaults to `True`): + Resize depth prediction to match input resolution. + Only valid if `limit_input_res` is not None. + denoising_steps (`int`, *optional*, defaults to `10`): + Number of diffusion denoising steps (DDIM) during inference. + ensemble_size (`int`, *optional*, defaults to `10`): + Number of predictions to be ensembled. + batch_size (`int`, *optional*, defaults to `0`): + Inference batch size, no bigger than `num_ensemble`. + If set to 0, the script will automatically decide the proper batch size. + show_progress_bar (`bool`, *optional*, defaults to `True`): + Display a progress bar of diffusion denoising. + color_map (`str`, *optional*, defaults to `"Spectral"`): + Colormap used to colorize the depth map. + ensemble_kwargs (`dict`, *optional*, defaults to `None`): + Arguments for detailed ensembling settings. + Returns: + `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including: + - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1] + - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] + and values in [0, 1] + - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation) + coming from ensembling. 
None if `ensemble_size = 1` + """ + + device = self.device + input_size = input_image.size + + if not match_input_res: + assert (processing_res is not None + ), 'Value error: `resize_output_back` is only valid with ' + assert processing_res >= 0 + assert denoising_steps >= 1 + assert ensemble_size >= 1 + + # ----------------- Image Preprocess ----------------- + # Resize image + if processing_res > 0: + input_image = resize_max_res( + input_image, max_edge_resolution=processing_res) + # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel + input_image = input_image.convert('RGB') + image = np.asarray(input_image) + + # Normalize rgb values + rgb = np.transpose(image, (2, 0, 1)) # [H, W, rgb] -> [rgb, H, W] + rgb_norm = rgb / 255.0 + rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype) + rgb_norm = rgb_norm.to(device) + assert rgb_norm.min() >= 0.0 and rgb_norm.max() <= 1.0 + + # ----------------- Predicting depth ----------------- + # Batch repeated input image + duplicated_rgb = torch.stack([rgb_norm] * ensemble_size) + single_rgb_dataset = TensorDataset(duplicated_rgb) + if batch_size > 0: + _bs = batch_size + else: + _bs = find_batch_size( + ensemble_size=ensemble_size, + input_res=max(rgb_norm.shape[1:]), + dtype=self.dtype, + ) + + single_rgb_loader = DataLoader( + single_rgb_dataset, batch_size=_bs, shuffle=False) + + # Predict depth maps (batched) + depth_pred_ls = [] + if show_progress_bar: + iterable = tqdm( + single_rgb_loader, + desc=' ' * 2 + 'Inference batches', + leave=False) + else: + iterable = single_rgb_loader + for batch in iterable: + (batched_img, ) = batch + depth_pred_raw = self.single_infer( + rgb_in=batched_img, + num_inference_steps=denoising_steps, + show_pbar=show_progress_bar, + ) + depth_pred_ls.append(depth_pred_raw.detach().clone()) + depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze() + torch.cuda.empty_cache() # clear vram cache for ensembling + + # ----------------- Test-time ensembling ----------------- + if ensemble_size > 1: + depth_pred, pred_uncert = ensemble_depths( + depth_preds, **(ensemble_kwargs or {})) + else: + depth_pred = depth_preds + pred_uncert = None + + # ----------------- Post processing ----------------- + # Scale prediction to [0, 1] + min_d = torch.min(depth_pred) + max_d = torch.max(depth_pred) + depth_pred = (depth_pred - min_d) / (max_d - min_d) + + # Convert to numpy + depth_pred = depth_pred.cpu().numpy().astype(np.float32) + + # Resize back to original resolution + if match_input_res: + pred_img = Image.fromarray(depth_pred) + pred_img = pred_img.resize(input_size) + depth_pred = np.asarray(pred_img) + + # Clip output range + depth_pred = depth_pred.clip(0, 1) + + # Colorize + depth_colored = colorize_depth_maps( + depth_pred, 0, 1, + cmap=color_map).squeeze() # [3, H, W], value in (0, 1) + depth_colored = (depth_colored * 255).astype(np.uint8) + depth_colored_hwc = chw2hwc(depth_colored) + depth_colored_img = Image.fromarray(depth_colored_hwc) + return MarigoldDepthOutput( + depth_np=depth_pred, + depth_colored=depth_colored_img, + uncertainty=pred_uncert, + ) + + def __encode_empty_text(self): + """ + Encode text embedding for empty prompt + """ + prompt = '' + text_inputs = self.tokenizer( + prompt, + padding='do_not_pad', + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors='pt', + ) + text_input_ids = text_inputs.input_ids.to(self.text_encoder.device) + self.empty_text_embed = self.text_encoder(text_input_ids)[0].to( + self.dtype) + + @torch.no_grad() + 
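+ # (editorial note, not part of the original patch) single_infer below runs a
+ # standard DDIM loop: the RGB image is encoded to a latent, the depth latent
+ # starts from Gaussian noise, and it is iteratively denoised conditioned on
+ # the image latent plus an empty-text CLIP embedding, then decoded and
+ # rescaled from [-1, 1] to [0, 1].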
def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, + show_pbar: bool) -> torch.Tensor: + r""" + Perform an individual depth prediction without ensembling. + + Args: + rgb_in (`torch.Tensor`): + Input RGB image. + num_inference_steps (`int`): + Number of diffusion denoisign steps (DDIM) during inference. + show_pbar (`bool`): + Display a progress bar of diffusion denoising. + Returns: + `torch.Tensor`: Predicted depth map. + """ + device = rgb_in.device + + # Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps # [T] + + # Encode image + rgb_latent = self.encode_rgb(rgb_in) + + # Initial depth map (noise) + depth_latent = torch.randn( + rgb_latent.shape, device=device, dtype=self.dtype) # [B, 4, h, w] + + # Batched empty text embedding + if self.empty_text_embed is None: + self.__encode_empty_text() + batch_empty_text_embed = self.empty_text_embed.repeat( + (rgb_latent.shape[0], 1, 1)) # [B, 2, 1024] + + # Denoising loop + if show_pbar: + iterable = tqdm( + enumerate(timesteps), + total=len(timesteps), + leave=False, + desc=' ' * 4 + 'Diffusion denoising', + ) + else: + iterable = enumerate(timesteps) + + for i, t in iterable: + unet_input = torch.cat([rgb_latent, depth_latent], + dim=1) # this order is important + + # predict the noise residual + noise_pred = self.unet( + unet_input, t, encoder_hidden_states=batch_empty_text_embed + ).sample # [B, 4, h, w] + + # compute the previous noisy sample x_t -> x_t-1 + depth_latent = self.scheduler.step(noise_pred, t, + depth_latent).prev_sample + torch.cuda.empty_cache() + depth = self.decode_depth(depth_latent) + + # clip prediction + depth = torch.clip(depth, -1.0, 1.0) + # shift to [0, 1] + depth = (depth + 1.0) / 2.0 + + return depth + + def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor: + """ + Encode RGB image into latent. + + Args: + rgb_in (`torch.Tensor`): + Input RGB image to be encoded. + + Returns: + `torch.Tensor`: Image latent. + """ + # encode + h = self.vae.encoder(rgb_in) + moments = self.vae.quant_conv(h) + mean, logvar = torch.chunk(moments, 2, dim=1) + # scale latent + rgb_latent = mean * self.rgb_latent_scale_factor + return rgb_latent + + def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor: + """ + Decode depth latent into depth map. + + Args: + depth_latent (`torch.Tensor`): + Depth latent to be decoded. + + Returns: + `torch.Tensor`: Decoded depth map. 
+ """ + # scale latent + depth_latent = depth_latent / self.depth_latent_scale_factor + # decode + z = self.vae.post_quant_conv(depth_latent) + stacked = self.vae.decoder(z) + # mean of output channels + depth_mean = stacked.mean(dim=1, keepdim=True) + return depth_mean + + def forward(self, x): + out = self.__call__(x) + return out diff --git a/modelscope/pipelines/cv/image_editing_pipeline.py b/modelscope/pipelines/cv/image_editing_pipeline.py index 15e21eafb..489fa422a 100644 --- a/modelscope/pipelines/cv/image_editing_pipeline.py +++ b/modelscope/pipelines/cv/image_editing_pipeline.py @@ -12,7 +12,7 @@ from modelscope.metainfo import Pipelines from modelscope.models.cv.image_editing import ( - MutualSelfAttentionControl, regiter_attention_editor_diffusers) + MutualSelfAttentionControl, register_attention_editor_diffusers) from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeline import \ @@ -97,7 +97,7 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: start_code = start_code.expand(len(prompts), -1, -1, -1) STEP, LAYER = 4, 10 editor = MutualSelfAttentionControl(STEP, LAYER) - regiter_attention_editor_diffusers(self.pipeline, editor) + register_attention_editor_diffusers(self.pipeline, editor) # inference the synthesized image output = self.pipeline( diff --git a/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py new file mode 100644 index 000000000..a49ca08d6 --- /dev/null +++ b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py @@ -0,0 +1,122 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_local_feature_matching, + module_name=Pipelines.image_local_feature_matching) +class ImageLocalFeatureMatchingPipeline(Pipeline): + r""" Image Local Feature Matching Pipeline. 
+ + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> matcher = pipeline(Tasks.image_local_feature_matching, + >>> model='Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data') + >>> matcher([['https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching1.jpg', + >>> 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching2.jpg']]) + >>> [{ + >>> 'matches': [array([[720.5 , 187.8 ], + >>> [707.4 , 198.23334], + >>> ..., + >>> [746.7 , 594.7 ], + >>> [759.8 , 594.7 ]], dtype=float32), + >>> array([[652.49744 , 29.599142], + >>> [639.25287 , 45.90798 ], + >>> [653.041 , 43.399014], + >>> ..., + >>> [670.8787 , 547.8298 ], + >>> [608.5573 , 548.97815 ], + >>> [617.82574 , 548.601 ]], dtype=float32), + >>> array([0.25541496, 0.2781789 , 0.20776041, ..., 0.39656195, 0.7202848 , + >>> 0.37208357], dtype=float32)], + >>> 'output_img': array([[[255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> ..., + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255]], + >>> ..., + >>> [[255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> ..., + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255]]], dtype=uint8)}] + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image local feature matching pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + def load_image(self, img_name): + img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) + img = img / 255. + # convert rgb to gray + if len(img.shape) == 3: + img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + H, W = 480, 640 + h_scale, w_scale = H / img.shape[0], W / img.shape[1] + img = cv2.resize(img, (W, H)) + return img, h_scale, w_scale + + def preprocess(self, input: Input): + assert len(input) == 2, 'input should be a list of two images' + + img1, h_scale1, w_scale1 = self.load_image(input[0]) + + img2, h_scale2, w_scale2 = self.load_image(input[1]) + + img1 = torch.from_numpy(img1)[None][None].cuda().float() + img2 = torch.from_numpy(img2)[None][None].cuda().float() + return { + 'image0': img1, + 'image1': img2, + 'scale_info': [h_scale1, w_scale1, h_scale2, w_scale2] + } + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + matches = results[OutputKeys.MATCHES] + + kpts0 = matches['kpts0'].cpu().numpy() + kpts1 = matches['kpts1'].cpu().numpy() + conf = matches['conf'].cpu().numpy() + scale_info = [v.cpu().numpy() for v in inputs['scale_info']] + kpts0[:, 0] = kpts0[:, 0] / scale_info[1] + kpts0[:, 1] = kpts0[:, 1] / scale_info[0] + kpts1[:, 0] = kpts1[:, 0] / scale_info[3] + kpts1[:, 1] = kpts1[:, 1] / scale_info[2] + + outputs = { + OutputKeys.MATCHES: [kpts0, kpts1, conf], + OutputKeys.OUTPUT_IMG: results[OutputKeys.OUTPUT_IMG] + } + + return outputs diff --git a/modelscope/pipelines/cv/image_matching_fast_pipeline.py b/modelscope/pipelines/cv/image_matching_fast_pipeline.py new file mode 100644 index 000000000..8af15f721 --- /dev/null +++ b/modelscope/pipelines/cv/image_matching_fast_pipeline.py @@ -0,0 +1,105 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
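The postprocess step above divides the predicted keypoints by the per-image resize factors to return them in original-image coordinates. A quick worked example, with the original image size invented for illustration:

```python
# Worked example of the keypoint rescaling in
# ImageLocalFeatureMatchingPipeline.postprocess.
import numpy as np

H, W = 480, 640                      # fixed inference resolution used above
orig_h, orig_w = 960, 1280           # hypothetical original image size
h_scale, w_scale = H / orig_h, W / orig_w         # 0.5, 0.5

kpts = np.array([[320.0, 240.0], [64.0, 400.0]])  # (x, y) on the resized image
kpts[:, 0] /= w_scale                # x back to original width
kpts[:, 1] /= h_scale                # y back to original height
print(kpts)                          # [[640. 480.] [128. 800.]]
```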
+from typing import Any, Dict, List, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_matching, module_name=Pipelines.image_matching_fast) +class ImageMatchingFastPipeline(Pipeline): + """ Image Matching Pipeline. + + Examples: + + >>> from modelscope.outputs import OutputKeys + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> task = 'image-matching' + >>> model_id = 'Damo_XR_Lab/cv_transformer_image-matching_fast' + + >>> input_location = [[ + >>> 'data/test/images/image_matching1.jpg', + >>> 'data/test/images/image_matching1.jpg', + >>> ]] + >>> estimator = pipeline(task, model=model_id) + >>> result = estimator(input_location) + >>> kpts0, kpts1, confidence = result[0][OutputKeys.MATCHES] + >>> print(f'Found {len(kpts0)} matches') + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image matching pipeline fast for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + # check if cuda is available + if not torch.cuda.is_available(): + raise RuntimeError( + 'Cuda is not available. Image matching model only supports cuda.' + ) + + logger.info('image matching model, pipeline init') + + def load_image(self, img_name): + image_loader = LoadImage(backend='cv2') + img = image_loader(img_name)['img'] + return img + + def preprocess(self, input: Input): + assert len(input) == 2, 'input should be a list of two images' + img1 = self.load_image(input[0]) + img2 = self.load_image(input[1]) + + return {'image0': img1, 'image1': img2} + + def forward(self, input: Dict[str, Any]) -> list: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: list) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + matches = results[OutputKeys.MATCHES] + + kpts0 = matches['kpts0'].detach().cpu().numpy() + kpts1 = matches['kpts1'].detach().cpu().numpy() + confidence = matches['confidence'].detach().cpu().numpy() + + outputs = { + OutputKeys.MATCHES: [kpts0, kpts1, confidence], + } + + return outputs + + def __call__(self, input, **kwargs): + """ + Match two images and return the matched keypoints and confidence. + + Args: + input (`List[List[str]]`): A list of two image paths. + + Return: + A list of result. + The list contain the following values: + + - kpts0 -- Matched keypoints in the first image + - kpts1 -- Matched keypoints in the second image + - confidence -- Confidence of the match + """ + return super().__call__(input, **kwargs) diff --git a/modelscope/pipelines/cv/image_normal_estimation_pipeline.py b/modelscope/pipelines/cv/image_normal_estimation_pipeline.py new file mode 100644 index 000000000..6622a6ee3 --- /dev/null +++ b/modelscope/pipelines/cv/image_normal_estimation_pipeline.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
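A possible downstream use of the `kpts0`/`kpts1`/`confidence` arrays returned by the fast matching pipeline above: filtering and drawing matches with OpenCV. The helper is written for illustration only and is not part of the patch.

```python
# Hypothetical helper (not in the patch): visualize matches returned by
# ImageMatchingFastPipeline. Assumes both images share the same height.
import cv2
import numpy as np


def draw_matches(img0, img1, kpts0, kpts1, confidence, thr=0.5):
    keep = confidence > thr                       # drop low-confidence matches
    canvas = np.concatenate([img0, img1], axis=1)  # side-by-side canvas
    offset = img0.shape[1]                         # x shift for the right image
    for (x0, y0), (x1, y1) in zip(kpts0[keep], kpts1[keep]):
        cv2.line(canvas, (int(x0), int(y0)), (int(x1) + offset, int(y1)),
                 (0, 255, 0), 1)
    return canvas
```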
+from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_normal_estimation, + module_name=Pipelines.image_normal_estimation) +class ImageNormalEstimationPipeline(Pipeline): + r""" Image Normal Estimation Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline( + >>> Tasks.image_normal_estimation, model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal') + >>> estimator("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg") + >>> { + >>> "normals": array([[[0.09233217, 0.07563387, 0.08025375, ..., 0.06992684, + >>> 0.07490329, 0.14308228], + >>> [0.07833742, 0.06736029, 0.07296766, ..., 0.09184352, + >>> 0.0800755 , 0.09726034], + >>> [0.07676302, 0.06631223, 0.07067154, ..., 0.09527256, + >>> 0.09292313, 0.08056315], + >>> ..., + >>> [0.26432115, 0.29100573, 0.2956126 , ..., 0.2913087 , + >>> 0.29201347, 0.29539976], + >>> [0.24557455, 0.26430887, 0.28548756, ..., 0.2877307 , + >>> 0.28856137, 0.2937242 ], + >>> [0.26316068, 0.2718169 , 0.28436714, ..., 0.29435217, + >>> 0.29842147, 0.2943223 ]], + >>> [[0.59257126, 0.6459297 , 0.66572756, ..., 0.68350476, + >>> 0.6882835 , 0.66579086], + >>> [0.7054596 , 0.6592535 , 0.6728153 , ..., 0.6589912 , + >>> 0.64541686, 0.63954735], + >>> [0.6912665 , 0.6638877 , 0.67816293, ..., 0.6607329 , + >>> 0.6472897 , 0.64633334], + >>> ..., + >>> [0.04231769, 0.04427819, 0.04816979, ..., 0.04485315, + >>> 0.04652229, 0.04869233], + >>> [0.04601872, 0.03706329, 0.04397734, ..., 0.04522909, + >>> 0.04745695, 0.04823782], + >>> [0.06671816, 0.0520605 , 0.0563788 , ..., 0.04913886, + >>> 0.04974678, 0.04954173]], + >>> [[0.4338835 , 0.43240184, 0.43519282, ..., 0.36894026, + >>> 0.35207224, 0.33153164], + >>> [0.4786287 , 0.4399531 , 0.4350407 , ..., 0.34690523, + >>> 0.3179497 , 0.26544768], + >>> [0.47692937, 0.4416514 , 0.437603 , ..., 0.34660107, + >>> 0.3102659 , 0.27787644], + >>> ..., + >>> [0.49566334, 0.48355937, 0.48710674, ..., 0.4964854 , + >>> 0.48945957, 0.49413157], + >>> [0.490632 , 0.4706958 , 0.48100013, ..., 0.48724395, + >>> 0.4799561 , 0.48129278], + >>> [0.49428058, 0.47433382, 0.4823783 , ..., 0.48930234, + >>> 0.48616886, 0.47176325]]], dtype=float32), + >>> 'normals_color': array([[[ 23, 151, 110], + >>> [ 19, 164, 110], + >>> [ 20, 169, 110], + >>> ..., + >>> [ 17, 174, 94], + >>> [ 19, 175, 89], + >>> [ 36, 169, 84]], + >>> [[ 19, 179, 122], + >>> [ 17, 168, 112], + >>> [ 18, 171, 110], + >>> ..., + >>> [ 23, 168, 88], + >>> [ 20, 164, 81], + >>> [ 24, 163, 67]], + >>> [[ 19, 176, 121], + >>> [ 16, 169, 112], + >>> [ 18, 172, 111], + >>> ..., + >>> [ 24, 168, 88], + >>> [ 23, 165, 79], + >>> [ 20, 164, 70]], + >>> ..., + >>> [[ 67, 10, 126], + >>> [ 74, 11, 123], + >>> [ 75, 12, 124], + >>> ..., + >>> [ 74, 11, 126], + >>> [ 74, 11, 124], + >>> [ 75, 12, 126]], + >>> [[ 62, 11, 125], + >>> [ 67, 9, 120], + >>> [ 72, 11, 122], + >>> ..., + >>> [ 73, 11, 124], + >>> [ 73, 12, 122], + >>> [ 74, 12, 122]], + >>> [[ 67, 17, 126], + >>> [ 69, 13, 120], + >>> [ 72, 14, 123], + >>> ..., + >>> [ 75, 12, 
124], + >>> [ 76, 12, 123], + >>> [ 75, 12, 120]]], dtype=uint8)} + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image normal estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('normal estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input).astype(np.float32) + H, W = 384, 384 + img = cv2.resize(img, [W, H]) + img = img.transpose(2, 0, 1) / 255.0 + imgs = img[None, ...] + data = {'imgs': imgs} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + normals = results[OutputKeys.NORMALS] + if isinstance(normals, torch.Tensor): + normals = normals.detach().cpu().squeeze().numpy() + normals_color = (np.transpose(normals, + (1, 2, 0)) * 255).astype(np.uint8) + outputs = { + OutputKeys.NORMALS: normals, + OutputKeys.NORMALS_COLOR: normals_color + } + + return outputs diff --git a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py index 18883171d..72d65cae4 100644 --- a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py +++ b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py @@ -173,11 +173,13 @@ def sr_process(self, img): def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) - img_sr = img if self.use_sr: img_sr = self.sr_process(img) - img = cv2.resize(img, img_sr.shape[:2][::-1]) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + else: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + img_sr = img.copy() result = {'img': img, 'img_sr': img_sr} return result @@ -200,6 +202,9 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: of, of_112, tfm_inv = warp_and_crop_face( img, facial5points, crop_size=(self.size, self.size)) + of = of[..., ::-1].copy() # BGR->RGB + of_112 = of_112[..., ::-1].copy() # BGR->RGB + # detect orig face quality fq_o, fea_o = self.eqface.get_face_quality(of_112) if fq_o < self.fqa_thres: diff --git a/modelscope/pipelines/cv/image_to_3d_pipeline.py b/modelscope/pipelines/cv/image_to_3d_pipeline.py new file mode 100644 index 000000000..d74003d6f --- /dev/null +++ b/modelscope/pipelines/cv/image_to_3d_pipeline.py @@ -0,0 +1,140 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
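# Editor's note: a short usage sketch for the ImageNormalEstimationPipeline above.
# OutputKeys.NORMALS holds the raw CHW float map and OutputKeys.NORMALS_COLOR the
# uint8 HWC visualization built in postprocess; the output filename is illustrative only.
import cv2
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

estimator = pipeline(
    Tasks.image_normal_estimation,
    model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal')
result = estimator(
    'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg')
cv2.imwrite('normals_vis.png', result[OutputKeys.NORMALS_COLOR])  # H x W x 3, uint8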
+import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import rembg +import torch +import torch.nn.functional as F +import torchvision.transforms as T +import torchvision.transforms.functional as TF +from omegaconf import OmegaConf +from PIL import Image +from torchvision.utils import save_image + +# import modelscope.models.cv.image_to_image_generation.data as data +# import modelscope.models.cv.image_to_image_generation.models as models +# import modelscope.models.cv.image_to_image_generation.ops as ops +from modelscope.metainfo import Pipelines +# from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer import \ +# SyncMultiviewDiffusion +from modelscope.models.cv.image_to_3d.ldm.util import (add_margin, + instantiate_from_config) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +# from modelscope.models.cv.image_to_3d.model import UNet +# from modelscope.models.cv.image_to_image_generation.models.clip import \ +# VisionTransformer + +logger = get_logger() + + +# Load Syncdreamer Model +def load_model(cfg, ckpt, strict=True): + config = OmegaConf.load(cfg) + model = instantiate_from_config(config.model) + print(f'loading model from {ckpt} ...') + ckpt = torch.load(ckpt, map_location='cpu') + model.load_state_dict(ckpt['state_dict'], strict=strict) + model = model.cuda().eval() + return model + + +# Prepare Syncdreamer Input +def prepare_inputs(image_input, elevation_input, crop_size=-1, image_size=256): + image_input[:, :, :3] = image_input[:, :, :3][:, :, ::-1] + image_input = Image.fromarray(image_input) + if crop_size != -1: + alpha_np = np.asarray(image_input)[:, :, 3] + coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)] + min_x, min_y = np.min(coords, 0) + max_x, max_y = np.max(coords, 0) + ref_img_ = image_input.crop((min_x, min_y, max_x, max_y)) + h, w = ref_img_.height, ref_img_.width + scale = crop_size / max(h, w) + h_, w_ = int(scale * h), int(scale * w) + ref_img_ = ref_img_.resize((w_, h_), resample=Image.BICUBIC) + image_input = add_margin(ref_img_, size=image_size) + else: + image_input = add_margin( + image_input, size=max(image_input.height, image_input.width)) + image_input = image_input.resize((image_size, image_size), + resample=Image.BICUBIC) + + image_input = np.asarray(image_input) + image_input = image_input.astype(np.float32) / 255.0 + ref_mask = image_input[:, :, 3:] + image_input[:, :, : + 3] = image_input[:, :, : + 3] * ref_mask + 1 - ref_mask # white background + image_input = image_input[:, :, :3] * 2.0 - 1.0 + image_input = torch.from_numpy(image_input.astype(np.float32)) + elevation_input = torch.from_numpy( + np.asarray([np.deg2rad(elevation_input)], np.float32)) + return {'input_image': image_input, 'input_elevation': elevation_input} + + +@PIPELINES.register_module( + Tasks.image_to_3d, module_name=Pipelines.image_to_3d) +class Image23DPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image-to-3d generation pipeline + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model) + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + # print(config_path) + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + ckpt = config_path.replace('configuration.json', + 'syncdreamer-pretrain.ckpt') + self.model = load_model( + config_path.replace('configuration.json', 'syncdreamer.yaml'), + ckpt).to(self._device) + # os.system("pip install -r {}".format(config_path.replace("configuration.json", "requirements.txt"))) + # assert isinstance(self.model, SyncMultiviewDiffusion) + + def preprocess(self, input: Input) -> Dict[str, Any]: + + result = rembg.remove(Image.open(input)) + print(type(result)) + img = np.array(result) + img[:, :, :3] = img[:, :, :3][:, :, ::-1] + # img = cv2.imread(input) + data = prepare_inputs( + img, elevation_input=10, crop_size=200, image_size=256) + + for k, v in data.items(): + data[k] = v.unsqueeze(0).cuda() + data[k] = torch.repeat_interleave( + data[k], 1, dim=0) # only one sample + return data + + @torch.no_grad() + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + x_sample = self.model.sample(input, 2.0, 8) + + B, N, _, H, W = x_sample.shape + x_sample = (torch.clamp(x_sample, max=1.0, min=-1.0) + 1) * 0.5 + x_sample = x_sample.permute(0, 1, 3, 4, 2).cpu().numpy() * 255 + x_sample = x_sample.astype(np.uint8) + show_in_im2 = [Image.fromarray(x_sample[0, ni]) for ni in range(N)] + return {'MViews': show_in_im2} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index cb7522c0d..5b0fbda5a 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -6,7 +6,6 @@ import cv2 import numpy as np -import tensorflow as tf import torch from modelscope.metainfo import Pipelines @@ -19,18 +18,7 @@ from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger -from .ocr_utils import (SegLinkDetector, boxes_from_bitmap, cal_width, - combine_segments_python, decode_segments_links_python, - nms_python, polygons_from_bitmap, rboxes_to_polygons) - -if tf.__version__ >= '2.0': - import tf_slim as slim -else: - from tensorflow.contrib import slim - -if tf.__version__ >= '2.0': - tf = tf.compat.v1 -tf.compat.v1.disable_eager_execution() +from .ocr_utils import cal_width, nms_python, rboxes_to_polygons logger = get_logger() @@ -39,12 +27,8 @@ OFFSET_DIM = 6 WORD_POLYGON_DIM = 8 OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] - -FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_float('node_threshold', 0.4, - 'Confidence threshold for nodes') -tf.app.flags.DEFINE_float('link_threshold', 0.6, - 'Confidence threshold for links') +TF_NODE_THRESHOLD = 0.4 +TF_LINK_THRESHOLD = 0.6 @PIPELINES.register_module( @@ -57,7 +41,7 @@ class OCRDetectionPipeline(Pipeline): ```python >>> from modelscope.pipelines import pipeline - >>> ocr_detection = pipeline('ocr_detection', model='damo/cv_resnet18_ocr-detection-line-level_damo') + >>> ocr_detection = pipeline('ocr-detection', model='damo/cv_resnet18_ocr-detection-line-level_damo') >>> result = ocr_detection('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/ocr_detection.jpg') {'polygons': array([[220, 14, 780, 14, 
780, 64, 220, 64], @@ -99,6 +83,16 @@ def __init__(self, model: str, **kwargs): logger.info('loading model done') else: # for model seglink++ + import tensorflow as tf + + if tf.__version__ >= '2.0': + tf = tf.compat.v1 + tf.compat.v1.disable_eager_execution() + + tf.app.flags.DEFINE_float('node_threshold', TF_NODE_THRESHOLD, + 'Confidence threshold for nodes') + tf.app.flags.DEFINE_float('link_threshold', TF_LINK_THRESHOLD, + 'Confidence threshold for links') tf.reset_default_graph() model_path = osp.join( osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), @@ -125,6 +119,7 @@ def __init__(self, model: str, **kwargs): variable_averages = tf.train.ExponentialMovingAverage( 0.997, global_step) + from .ocr_utils import SegLinkDetector, combine_segments_python, decode_segments_links_python # detector detector = SegLinkDetector() all_maps = detector.build_model( @@ -198,6 +193,7 @@ def preprocess(self, input: Input) -> Dict[str, Any]: result = self.preprocessor(input) return result else: + # for model seglink++ img = LoadImage.convert_to_ndarray(input) h, w, c = img.shape diff --git a/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py b/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py new file mode 100644 index 000000000..a4892273e --- /dev/null +++ b/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py @@ -0,0 +1,127 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import math +import os +import os.path as osp +import subprocess +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torchvision.utils import make_grid + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_frame_interpolation.rife import RIFEModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.preprocessors.cv import VideoReader +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_frame_interpolation, + module_name=Pipelines.rife_video_frame_interpolation) +class RIFEVideoFrameInterpolationPipeline(Pipeline): + r""" RIFE Video Frame Interpolation Pipeline. 
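+    Roughly doubles the frame rate of the input video (out_fps = 2 * fps) by
+    inserting a RIFE-interpolated frame between consecutive input frames.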
+ + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + + >>> video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_frame_interpolation_test.mp4' + >>> video_frame_interpolation_pipeline = pipeline(Tasks.video_frame_interpolation, + 'Damo_XR_Lab/cv_rife_video-frame-interpolation') + >>> result = video_frame_interpolation_pipeline(video)[OutputKeys.OUTPUT_VIDEO] + >>> print('pipeline: the output video path is {}'.format(result)) + + """ + + def __init__(self, + model: Union[RIFEModel, str], + preprocessor=None, + **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if (isinstance(model, str)): + self.model = RIFEModel(model) + logger.info('load video frame-interpolation done') + + def preprocess(self, input: Input, out_fps: float = 0) -> Dict[str, Any]: + # Determine the input type + if isinstance(input, str): + video_reader = VideoReader(input) + elif isinstance(input, dict): + video_reader = VideoReader(input['video']) + inputs = [] + for frame in video_reader: + inputs.append(frame) + fps = video_reader.fps + + for i, img in enumerate(inputs): + img = torch.from_numpy(img.copy()).permute(2, 0, 1).float() + inputs[i] = img.unsqueeze(0).to(self.model.device) + + out_fps = 2 * fps + return {'video': inputs, 'fps': fps, 'out_fps': out_fps} + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + inputs = input['video'] + # fps = input['fps'] + out_fps = input['out_fps'] + video_len = len(inputs) + + h, w = inputs[0].shape[-2:] + ph = ((h - 1) // 32 + 1) * 32 + pw = ((w - 1) // 32 + 1) * 32 + padding = (0, pw - w, 0, ph - h) + + outputs = [] + for i in range(video_len): + if i == 0: + outputs.append(inputs[i]) + elif i == video_len - 1: + outputs.append(inputs[i]) + else: + i0 = F.pad(inputs[i - 1] / 255., padding).to(self.model.device) + i1 = F.pad(inputs[i] / 255., padding).to(self.model.device) + output = self.model.inference(i0, i1)[:, :, :h, :w] + output = output.cpu() * 255 + torch.cuda.empty_cache() + outputs.append(output) + outputs.append(inputs[i]) + return {'output': outputs, 'fps': out_fps} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_video_path = kwargs.get('output_video', None) + demo_service = kwargs.get('demo_service', True) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + h, w = inputs['output'][0].shape[-2:] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(output_video_path, fourcc, + inputs['fps'], (w, h)) + for i in range(len(inputs['output'])): + img = inputs['output'][i] + img = img[0].permute(1, 2, 0).byte().cpu().numpy() + video_writer.write(img.astype(np.uint8)) + + video_writer.release() + if demo_service: + assert os.system( + 'ffmpeg -version') == 0, 'ffmpeg is not installed correctly!' 
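+            # Re-encode to h264 with ffmpeg so the demo result plays in browsers;
+            # the mp4v stream written by cv2.VideoWriter above is not web-friendly.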
+ output_video_path_for_web = output_video_path[:-4] + '_web.mp4' + convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}' + subprocess.call(convert_cmd, shell=True) + return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web} + else: + return {OutputKeys.OUTPUT_VIDEO: output_video_path} diff --git a/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py b/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py new file mode 100644 index 000000000..3f16d8ff0 --- /dev/null +++ b/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.self_supervised_depth_completion, + module_name=Pipelines.self_supervised_depth_completion) +class SelfSupervisedDepthCompletionPipeline(Pipeline): + """Self Supervise dDepth Completion Pipeline + Example: + + ```python + >>> from modelscope.pipelines import pipeline + >>> model_id = 'Damo_XR_Lab/Self_Supervised_Depth_Completion' + >>> data_dir = MsDataset.load( + 'KITTI_Depth_Dataset', + namespace='Damo_XR_Lab', + split='test', + download_mode=DownloadMode.FORCE_REDOWNLOAD + ).config_kwargs['split_config']['test'] + >>> source_dir = os.path.join(data_dir, 'selected_data') + >>> self_supervised_depth_completion = pipeline(Tasks.self_supervised_depth_completion, + 'Damo_XR_Lab/Self_Supervised_Depth_Completion') + >>> result = self_supervised_depth_completion({ + 'model_dir': model_id + 'source_dir': source_dir + }) + cv2.imwrite('result.jpg', result[OutputKeys.OUTPUT]) + >>> # + ``` + """ + + def __init__(self, model: str, **kwargs): + + super().__init__(model=model, **kwargs) + logger.info('load model done') + + def preprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """preprocess, not used at present""" + return inputs + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """forward""" + source_dir = inputs['source_dir'] + result = self.model.forward(source_dir) + return {OutputKeys.OUTPUT: result} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """postprocess, not used at present""" + return inputs diff --git a/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py b/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py index 1b791634e..320d83e7e 100644 --- a/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py +++ b/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py @@ -36,8 +36,10 @@ def __init__(self, model: str, **kwargs): 'data/test/images/vision_efficient_tuning_test_1.png') >>> print(f'Output: {result}.') """ + logger.warn( + '[NOTE]Do not use this pipeline because the dependencies are too old, ' + 'use https://github.com/modelscope/DiffSynth-Studio instead') super().__init__(model=model, **kwargs) - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.model = self.model.to(self.device) self.model.eval() diff --git a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py new file mode 100644 index 000000000..f19eddffe --- /dev/null +++ 
b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, Union + +import torch + +from modelscope import AutoModelForCausalLM +from modelscope.metainfo import Pipelines, Preprocessors +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.multi_modal.visual_question_answering_pipeline import \ + VisualQuestionAnsweringPipeline +from modelscope.preprocessors import Preprocessor, load_image +from modelscope.utils.constant import Fields, Frameworks, Tasks + + +@PIPELINES.register_module( + Tasks.visual_question_answering, module_name='ovis-vl') +class VisionChatPipeline(VisualQuestionAnsweringPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + # super().__init__ + self.device_name = device + self.framework = Frameworks.torch + self._model_prepare = True + self._auto_collate = auto_collate + + # ovis + torch_dtype = kwargs.get('torch_dtype', torch.float16) + multimodal_max_length = kwargs.get('multimodal_max_length', 8192) + self.device = 'cuda' if device == 'gpu' else device + self.model = AutoModelForCausalLM.from_pretrained( + model, + torch_dtype=torch_dtype, + multimodal_max_length=multimodal_max_length, + trust_remote_code=True).to(self.device) + self.text_tokenizer = self.model.get_text_tokenizer() + self.visual_tokenizer = self.model.get_visual_tokenizer() + + def preprocess(self, inputs: Dict[str, Any]): + text = inputs['text'] + image_path_or_url = inputs['image'] + image = load_image(image_path_or_url) + query = f'\n{text}' + _, input_ids, pixel_values = self.model.preprocess_inputs( + query, [image]) + attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) + input_ids = input_ids.unsqueeze(0).to(device=self.model.device) + attention_mask = attention_mask.unsqueeze(0).to( + device=self.model.device) + pixel_values = [ + pixel_values.to( + dtype=self.visual_tokenizer.dtype, + device=self.visual_tokenizer.device) + ] + + return { + 'input_ids': input_ids, + 'pixel_values': pixel_values, + 'attention_mask': attention_mask + } + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + input_ids = inputs['input_ids'] + pixel_values = inputs['pixel_values'] + attention_mask = inputs['attention_mask'] + + max_new_tokens = forward_params.get('max_new_tokens', 1024) + do_sample = forward_params.get('do_sample', False) + top_p = forward_params.get('top_p', None) + top_k = forward_params.get('top_k', None) + temperature = forward_params.get('temperature', None) + repetition_penalty = forward_params.get('repetition_penalty', None) + with torch.inference_mode(): + gen_kwargs = dict( + max_new_tokens=max_new_tokens, + do_sample=do_sample, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + eos_token_id=self.model.generation_config.eos_token_id, + pad_token_id=self.text_tokenizer.pad_token_id, + use_cache=True) + output_ids = self.model.generate( + input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + **gen_kwargs)[0] + return {'output_ids': output_ids} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_ids = inputs['output_ids'] + output = self.text_tokenizer.decode( + output_ids, skip_special_tokens=True) + return 
{OutputKeys.TEXT: output} diff --git a/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py index f3ff7ccea..59320577d 100644 --- a/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py +++ b/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py @@ -6,6 +6,7 @@ import cv2 import torch +import torchvision from einops import rearrange from modelscope.metainfo import Pipelines @@ -75,14 +76,17 @@ def postprocess(self, inputs: Dict[str, Any], output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name temp_video_file = True - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - h, w, c = video[0].shape - video_writer = cv2.VideoWriter( - output_video_path, fourcc, fps=8, frameSize=(w, h)) - for i in range(len(video)): - img = cv2.cvtColor(video[i], cv2.COLOR_RGB2BGR) - video_writer.write(img) - video_writer.release() + # Ensure video is a list of frames with shape (h, w, c) + frames = [torch.from_numpy(frame) for frame in video] + # Stack frames along a new dimension to create a 4D tensor (T, H, W, C) + imgs_tensor = torch.stack(frames, dim=0) + + torchvision.io.write_video( + output_video_path, + imgs_tensor, + fps=8, + video_codec='h264', + options={'crf': '10'}) if temp_video_file: video_file_content = b'' with open(output_video_path, 'rb') as f: diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 0b2ba1996..3205f8b5f 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -50,6 +50,9 @@ def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters def get_sentence_embedding(self, inputs, max_len=None): + if (self.model or (self.has_multiple_models and self.models[0])): + if not self._model_prepare: + self.prepare_model() inputs = self.preprocessor.batch_encode(inputs, max_length=max_len) sentence_vecs = self.model.forward_sentence_embedding(inputs) sentence_vecs = sentence_vecs.detach().tolist() diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index 5cd2dcb16..c46bb46ae 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -1,12 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
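# Editor's note on the text_to_video_synthesis change above: torchvision.io.write_video
# expects a uint8 tensor of shape (T, H, W, C) in RGB order, which is why the per-frame
# cv2.cvtColor/VideoWriter loop could be dropped. A minimal sketch with dummy frames:
import torch
import torchvision

frames = [torch.randint(0, 256, (256, 256, 3), dtype=torch.uint8) for _ in range(16)]
imgs_tensor = torch.stack(frames, dim=0)  # (T, H, W, C), uint8
torchvision.io.write_video('sample.mp4', imgs_tensor, fps=8,
                           video_codec='h264', options={'crf': '10'})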
import os -import os.path as osp from contextlib import contextmanager -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from threading import Lock +from typing import Any, Callable, Dict, Generator, Iterator, List, Tuple, Union import json +import numpy as np import torch -from transformers import PreTrainedTokenizer +from transformers import AutoConfig, PreTrainedModel, PreTrainedTokenizer from modelscope import (AutoModelForCausalLM, AutoTokenizer, Pipeline, snapshot_download) @@ -14,79 +15,84 @@ from modelscope.models.base import Model from modelscope.models.nlp import ChatGLM2Tokenizer, Llama2Tokenizer from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.util import is_model, is_official_hub_path from modelscope.utils.config import Config -from modelscope.utils.constant import Invoke, ModelFile, Tasks +from modelscope.utils.constant import Frameworks, Invoke, ModelFile, Tasks +from modelscope.utils.device import create_device, device_placement from modelscope.utils.logger import get_logger +from modelscope.utils.model_type_helper import ModelTypeHelper +from modelscope.utils.streaming_output import (PipelineStreamingOutputMixin, + StreamingOutputMixin, + add_stream_generate) logger = get_logger() +SWIFT_MODEL_ID_MAPPING = {} +SWIFT_FRAMEWORK = 'swift' -class ModelTypeHelper: - @staticmethod - def _get_file_name(model: str, cfg_name: str, - revision: Optional[str]) -> Optional[str]: - if osp.exists(model): - return osp.join(model, cfg_name) - try: - return model_file_download(model, cfg_name, revision=revision) - except Exception: - return None +class LLMAdapterRegistry: - @staticmethod - def _parse_and_get(file: Optional[str], pattern: str) -> Optional[str]: - if file is None or not osp.exists(file): - return None - return Config.from_file(file).safe_get(pattern) + llm_format_map = {'qwen': [None, None, None]} @classmethod - def _get(cls, model: str, revision: Optional[str]) -> Optional[str]: - cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) - hf_cfg_file = cls._get_file_name(model, ModelFile.CONFIG, revision) - cfg_model_type = cls._parse_and_get(cfg_file, 'model.type') - hf_cfg_model_type = cls._parse_and_get(hf_cfg_file, 'model_type') - return cfg_model_type or hf_cfg_model_type + def _add_to_map(cls, model_type: str, value_index: int = 0, member=None): + assert model_type or ModelTypeHelper.current_model_type + if model_type is None: + model_type = ModelTypeHelper.current_model_type + if model_type not in cls.llm_format_map: + cls.llm_format_map[model_type] = [None, None, None] + assert cls.llm_format_map[model_type][value_index] is None + cls.llm_format_map[model_type][value_index] = member + return member @classmethod - def _get_adapter(cls, model: str, - revision: Optional[str]) -> Optional[str]: - cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) - model = cls._parse_and_get(cfg_file, 'adapter_cfg.model_id_or_path') - revision = cls._parse_and_get(cfg_file, 'adapter_cfg.model_revision') - return None if model is None else cls._get(model, revision) + def _wrapper(cls, model_type: str, value_index: int = 0, member=None): + if member is not None: + return cls._add_to_map(model_type, value_index, member) + + def _register(member): + return cls._add_to_map(model_type, value_index, member) + + return _register @classmethod - def get(cls, - model: str, - revision: Optional[str] = None, - 
with_adapter: bool = False, - split: Optional[str] = None) -> Optional[str]: - model_type = cls._get(model, revision) - if model_type is None and with_adapter: - model_type = cls._get_adapter(model, revision) - if model_type is None: - return None - model_type = model_type.lower() - if split is None: - return model_type - return model_type.split(split)[0] + def register_format_messages(cls, model_type: str = None, function=None): + return cls._wrapper(model_type, 0, function) + + @classmethod + def register_format_output(cls, model_type: str = None, function=None): + return cls._wrapper(model_type, 1, function) + + @classmethod + def register_tokenizer(cls, model_type: str = None, tokenizer_class=None): + return cls._wrapper(model_type, 2, tokenizer_class) + + @classmethod + def contains(cls, model_name: str) -> bool: + return model_name in cls.llm_format_map + + @classmethod + def get(cls, model_name: str) -> bool: + return cls.llm_format_map[model_name] @PIPELINES.register_module(Tasks.chat, module_name='llm') @PIPELINES.register_module(Tasks.text_generation, module_name='llm') -class LLMPipeline(Pipeline): +class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): def initiate_single_model(self, model): + from swift import Swift + if isinstance(model, str): logger.info(f'initiate model from {model}') if self._is_swift_model(model): if self.llm_framework is not None: logger.warning( - f'Cannot using swift with llm_framework, ignoring {self.llm_framework}.' + f'Cannot use swift with llm_framework, ignoring {self.llm_framework}.' ) - from swift import Swift base_model = self.cfg.safe_get('adapter_cfg.model_id_or_path') assert base_model is not None, 'Cannot get adapter_cfg.model_id_or_path from configuration.json file.' @@ -104,12 +110,20 @@ def initiate_single_model(self, model): if isinstance(model, str) and is_official_hub_path(model): logger.info(f'initiate model from location {model}.') - if self.llm_framework is not None: + if self.llm_framework: model_dir = model if os.path.exists( model) else snapshot_download(model) - return self._wrap_infer_framework(model_dir, - self.llm_framework) - elif is_model(model): + try: + model = self._wrap_infer_framework(model_dir, + self.llm_framework) + logger.info(f'initiate model with {self.llm_framework}.') + return model + except Exception as e: + logger.warning( + f'Cannot using llm_framework with {model}, ' + f'ignoring llm_framework={self.llm_framework} : {e}') + self.llm_framework = None + if is_model(model): return Model.from_pretrained( model, invoked_by=Invoke.PIPELINE, @@ -142,7 +156,8 @@ def _is_swift_model(self, model: Union[str, Any]) -> bool: return False self.cfg = Config.from_file(cfg_file) - return self.cfg.safe_get('adapter_cfg.tuner_backend') == 'swift' + return self.cfg.safe_get( + 'adapter_cfg.tuner_backend') == SWIFT_FRAMEWORK def _wrap_infer_framework(self, model_dir, framework='vllm'): from modelscope.pipelines.accelerate.base import InferFramework @@ -157,26 +172,40 @@ def __init__(self, **kwargs): self.device_map = kwargs.pop('device_map', None) self.llm_framework = llm_framework - # TODO: qwen-int4 need 'cuda'/'auto' device_map. 
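# Editor's note: a hedged sketch of how the new LLMAdapterRegistry replaces the old
# LLM_FORMAT_MAP dict. The model type 'my-model' and the formatter are hypothetical;
# the real registrations (chatglm2, llama2, qwen2, ...) appear further down this file.
from modelscope.pipelines.nlp.llm_pipeline import LLMAdapterRegistry


@LLMAdapterRegistry.register_format_messages('my-model')
def my_model_format_messages(messages, tokenizer, **kwargs):
    # Flatten the chat messages into a single prompt and tokenize it.
    prompt = '\n'.join(m['content'] for m in messages['messages'])
    return tokenizer(prompt, return_token_type_ids=False, return_tensors='pt')


assert LLMAdapterRegistry.contains('my-model')
format_messages, format_output, tokenizer_class = LLMAdapterRegistry.get('my-model')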
- if not self.device_map and 'qwen' in kwargs['model'].lower(): - self.device_map = 'cuda' + + if os.path.exists(kwargs['model']): + config = AutoConfig.from_pretrained( + kwargs['model'], trust_remote_code=True) + q_config = config.__dict__.get('quantization_config', None) + if q_config: + if q_config.get( + 'quant_method', + 'gptq') == 'gptq' and torch.cuda.device_count(): + self.device_map = 'cuda' + self.torch_dtype = kwargs.pop('torch_dtype', None) self.ignore_file_pattern = kwargs.pop('ignore_file_pattern', None) + + if llm_framework == SWIFT_FRAMEWORK: + self._init_swift(kwargs['model'], kwargs.get('device', 'gpu')) + return with self._temp_configuration_file(kwargs): super().__init__(*args, **kwargs) + if isinstance(self.model, PreTrainedModel): + self.model = add_stream_generate(self.model) tokenizer_class = None if isinstance(format_messages, str): - assert format_messages in LLM_FORMAT_MAP, \ + assert LLMAdapterRegistry.contains(format_messages), \ f'Can not find function for `{format_messages}`!' - format_messages, format_output, tokenizer_class = LLM_FORMAT_MAP[ - format_messages] + format_messages, format_output, tokenizer_class = \ + LLMAdapterRegistry.get(format_messages) if format_messages is None: model_type = ModelTypeHelper.get(self.model.model_dir, split='-') - if model_type in LLM_FORMAT_MAP: - format_messages, format_output, tokenizer_class = LLM_FORMAT_MAP[ - model_type] + if LLMAdapterRegistry.contains(model_type): + format_messages, format_output, tokenizer_class = \ + LLMAdapterRegistry.get(model_type) if format_messages is not None: self.format_messages = format_messages @@ -185,6 +214,73 @@ def __init__(self, self.tokenizer = self._get_tokenizer( tokenizer_class) if tokenizer is None else tokenizer + def _init_swift(self, model_id, device) -> None: + from swift.llm import prepare_model_template + from swift.llm.utils import MODEL_MAPPING, InferArguments + + global SWIFT_MODEL_ID_MAPPING + if not SWIFT_MODEL_ID_MAPPING: + SWIFT_MODEL_ID_MAPPING = { + v['model_id_or_path']: k + for k, v in MODEL_MAPPING.items() + } + + def format_messages(messages: Dict[str, List[Dict[str, str]]], + tokenizer: PreTrainedTokenizer, + **kwargs) -> Dict[str, torch.Tensor]: + inputs, _ = self.template.encode(get_example(messages)) + inputs.pop('labels', None) + if 'input_ids' in inputs: + input_ids = torch.tensor(inputs['input_ids'])[None] + inputs['input_ids'] = input_ids + token_len = input_ids.shape[1] + if 'inputs_embeds' in inputs: + inputs_embeds = inputs['inputs_embeds'][None] + inputs['inputs_embeds'] = inputs_embeds + token_len = inputs_embeds.shape[1] + inputs['attention_mask'] = torch.ones(token_len)[None] + if 'token_type_ids' in inputs: + inputs['token_type_ids'] = torch.tensor( + inputs['token_type_ids'])[None] + return inputs + + def get_example( + messages: Dict[str, List[Dict[str, str]]]) -> Dict[str, str]: + messages = messages['messages'] + assert len(messages) > 0, 'messages cannot be empty!' + system = None + if messages[0]['role'] == 'system': + system = messages[0]['content'] + messages = messages[1:] + assert len(messages) % 2 == 1, 'Unsupported messages format!' 
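+            # Odd length means alternating (user, assistant) turns plus the final
+            # user query: the last content becomes the prompt, earlier pairs the history.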
+ contents = [message['content'] for message in messages] + prompt = contents[-1] + history = list(zip(contents[::2], contents[1::2])) + if self.llm_framework == SWIFT_FRAMEWORK: + return dict(system=system, query=prompt, history=history) + else: + return dict(system=system, prompt=prompt, history=history) + + assert model_id in SWIFT_MODEL_ID_MAPPING,\ + f'Invalid model id {model_id} or Swift framework does not support this model.' + args = InferArguments(model_type=SWIFT_MODEL_ID_MAPPING[model_id]) + model, template = prepare_model_template( + args, device_map=self.device_map) + self.model = add_stream_generate(model) + template.model = self.model + self.template = template + self.tokenizer = template.tokenizer + self.format_messages = format_messages + + self.has_multiple_models = False + self.framework = Frameworks.torch + self.device_name = device + self.device = create_device(device) + self._model_prepare = False + self._model_prepare_lock = Lock() + self._auto_collate = True + self._compile = False + @contextmanager def _temp_configuration_file(self, kwargs: Dict[str, Any]): kwargs['model'] = model = self.initiate_single_model(kwargs['model']) @@ -203,10 +299,11 @@ def _process_single(self, inputs, *args, **kwargs) -> Dict[str, Any]: forward_params = kwargs.get('forward_params', {}) postprocess_params = kwargs.get('postprocess_params', {}) - is_messages = isinstance(inputs, dict) and 'messages' in inputs - tokens = self.preprocess(inputs, is_messages, **preprocess_params) + preprocess_params['is_messages'] = postprocess_params['is_messages'] \ + = isinstance(inputs, dict) and 'messages' in inputs + tokens = self.preprocess(inputs, **preprocess_params) - if self.llm_framework is None: + if self.llm_framework in (None, SWIFT_FRAMEWORK): # pytorch model if hasattr(self.model, 'generate'): outputs = self.model.generate(**tokens, **forward_params) @@ -219,14 +316,65 @@ def _process_single(self, inputs, *args, **kwargs) -> Dict[str, Any]: tokens = [list(tokens['inputs'].flatten().numpy())] outputs = self.model(tokens, **forward_params)[0] - if self.llm_framework is None: + if self.llm_framework in (None, SWIFT_FRAMEWORK): # pytorch model outputs = outputs.tolist()[0][len(tokens['inputs'][0]):] - response = self.postprocess(outputs, is_messages, **postprocess_params) + response = self.postprocess(outputs, **postprocess_params) return response - def preprocess(self, inputs: Union[str, Dict], is_messages: bool, - **kwargs): + def stream_generate(self, inputs: Union[Input, List[Input]], *args, + **kwargs) -> Generator: + assert isinstance(self.model, StreamingOutputMixin + ), 'pipeline.model must be StreamingOutputMixin!' 
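+        # Lazily prepare (device-place) the model on the first streaming call,
+        # mirroring the non-streaming Pipeline.__call__ path.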
+ if (self.model or (self.has_multiple_models and self.models[0])): + if not self._model_prepare: + self.prepare_model() + + preprocess_params, forward_params, postprocess_params = self._sanitize_parameters( + **kwargs) + preprocess_params['is_messages'] = postprocess_params['is_messages'] \ + = isinstance(inputs, dict) and 'messages' in inputs + + if isinstance(inputs, list): + model_input_list = [ + self._preprocess_with_check(i, preprocess_params) + for i in inputs + ] + output = [] + for ele in model_input_list: + output.append( + self._stream_single(ele, forward_params, + postprocess_params)) + else: + model_input = self._preprocess_with_check(inputs, + preprocess_params) + output = self._stream_single(model_input, forward_params, + postprocess_params) + return output + + def _stream_single(self, model_input: Dict[str, Any], + forward_params: Dict[str, Any], + postprocess_params: Dict[str, Any]) -> Generator: + + with device_placement(self.framework, self.device_name): + if self.framework == Frameworks.torch: + with torch.no_grad(): + if self._auto_collate: + model_input = self._collate_fn(model_input) + stream = self.model.stream_generate( + **model_input, **forward_params) + else: + stream = self.model.stream_generate(**model_input, + **forward_params) + + for out in stream: + out = out.tolist()[0][len(model_input['inputs'][0]):] + out = self.postprocess(out, **postprocess_params) + self._check_output(out) + yield out + + def preprocess(self, inputs: Union[str, Dict], **kwargs): + is_messages = kwargs.pop('is_messages') if is_messages: tokens = self.format_messages(inputs, self.tokenizer, **kwargs) else: @@ -244,13 +392,16 @@ def preprocess(self, inputs: Union[str, Dict], is_messages: bool, else: raise ValueError('model does not have `device` attribute!') return { - k: (v.to(device) if isinstance(v, torch.Tensor) else v) + k: (v.to(device) if torch.is_tensor(v) else v) for k, v in tokens.items() } - def postprocess(self, outputs, is_messages: bool, **kwargs): - + def postprocess(self, outputs, **kwargs): + is_messages = kwargs.pop('is_messages') if not isinstance(outputs, str): + shape_type = (torch.Tensor, np.ndarray) + if isinstance(outputs, shape_type) and len(outputs.shape) > 1: + outputs = outputs[0] response = self.tokenizer.decode( outputs, skip_special_tokens=True, **kwargs) else: @@ -356,6 +507,7 @@ def _concat(ids: List[int], *args: Union[int, List[int]]) -> List[int]: return ids +@LLMAdapterRegistry.register_format_messages('chatglm2') def chatglm2_format_messages(messages, tokenizer, **kwargs): def build_chatglm2_prompt(messages, **kwargs): @@ -376,6 +528,8 @@ def build_chatglm2_prompt(messages, **kwargs): return tokenizer(prompt, return_token_type_ids=False, return_tensors='pt') +@LLMAdapterRegistry.register_format_output('chatglm') +@LLMAdapterRegistry.register_format_output('chatglm2') def chatglm2_format_output(response, **kwargs): response = response.strip() response = response.replace('[[训练时间]]', '2023年') @@ -386,6 +540,8 @@ def chatglm2_format_output(response, **kwargs): return outputs +@LLMAdapterRegistry.register_format_messages('llama') +@LLMAdapterRegistry.register_format_messages('llama2') def llama2_format_messages(messages, tokenizer, **kwargs): from transformers import BatchEncoding @@ -437,6 +593,8 @@ def build_llama2_prompt(messages, tokenizer, **kwargs): return BatchEncoding({'input_ids': tokens}) +@LLMAdapterRegistry.register_format_messages('baichuan') +@LLMAdapterRegistry.register_format_messages('baichuan2') def baichuan_format_messages(messages, 
tokenizer, **kwargs): from transformers import BatchEncoding @@ -490,6 +648,7 @@ def _parse_messages(messages, split_role='user'): return BatchEncoding({'input_ids': input_tokens}) +@LLMAdapterRegistry.register_format_messages('wizardlm') def wizardlm_format_messages(messages, tokenizer, **kwargs): def build_wizardlm_prompt(messages, tokenizer, **kwargs): @@ -520,6 +679,7 @@ def build_wizardlm_prompt(messages, tokenizer, **kwargs): return tokenizer(prompts, return_token_type_ids=False, return_tensors='pt') +@LLMAdapterRegistry.register_format_messages('wizardcode') def wizardcode_format_messages(messages, tokenizer, **kwargs): messages = messages['messages'] assert len(messages) == 2, 'wizard code only support two messages.' @@ -542,6 +702,7 @@ def wizardcode_format_messages(messages, tokenizer, **kwargs): return inputs +@LLMAdapterRegistry.register_format_messages('chatglm') def chatglm3_format_messages(messages, tokenizer, **kwargs): messages = messages['messages'] query, history = messages[-1]['content'], messages[:-1] @@ -555,15 +716,14 @@ def chatglm3_format_messages(messages, tokenizer, **kwargs): return inputs -LLM_FORMAT_MAP = { - 'chatglm2': - (chatglm2_format_messages, chatglm2_format_output, ChatGLM2Tokenizer), - 'qwen': (LLMPipeline.format_messages, LLMPipeline.format_output, None), - 'llama2': (llama2_format_messages, None, Llama2Tokenizer), - 'llama': (llama2_format_messages, None, Llama2Tokenizer), - 'baichuan': (baichuan_format_messages, None, None), - 'baichuan2': (baichuan_format_messages, None, None), - 'wizardlm': (wizardlm_format_messages, None, None), - 'wizardcode': (wizardcode_format_messages, None, None), - 'chatglm': (chatglm3_format_messages, chatglm2_format_output, None), -} +@LLMAdapterRegistry.register_format_messages('qwen2') +def qwen2_format_messages(messages, tokenizer, **kwargs): + messages = messages['messages'] + text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + return tokenizer([text], return_tensors='pt') + + +LLMAdapterRegistry.register_tokenizer('chatglm2', ChatGLM2Tokenizer) +LLMAdapterRegistry.register_tokenizer('llama', Llama2Tokenizer) +LLMAdapterRegistry.register_tokenizer('llama2', Llama2Tokenizer) diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 7c064f579..7e281a0ad 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -62,7 +62,7 @@ def __init__(self, self.preprocessor = TableQuestionAnsweringPreprocessor( self.model.model_dir, **kwargs) - # initilize tokenizer + # initialize tokenizer self.tokenizer = BertTokenizer( os.path.join(self.model.model_dir, ModelFile.VOCAB_FILE)) diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index dc4bc40a4..9fa5a2a8c 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -80,7 +80,8 @@ def postprocess(self, inputs: Dict[str, Tensor], sc_tensor = inputs['predictions'] if isinstance(sc_tensor, list): - sc_tensor = sc_tensor[0] + if isinstance(sc_tensor[0], list): + sc_tensor = sc_tensor[0] sc_sent = self.vocab.string( sc_tensor, extra_symbols_to_ignore={self.vocab.pad()}) sc_sent = (sc_sent + ' ').replace('##', '').rstrip() diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py 
b/modelscope/pipelines/nlp/text_generation_pipeline.py index 1015d3112..55eaf8091 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -18,9 +18,12 @@ from modelscope.utils.chinese_utils import remove_space_between_chinese_chars from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.hub import Config, read_config +from modelscope.utils.logger import get_logger from modelscope.utils.streaming_output import PipelineStreamingOutputMixin from modelscope.utils.torch_utils import is_on_same_device +logger = get_logger() + __all__ = [ 'TextGenerationPipeline', 'TextGenerationT5Pipeline', 'ChatGLM6bTextGenerationPipeline', 'ChatGLM6bV2TextGenerationPipeline', @@ -86,14 +89,24 @@ def __init__(self, self.postprocessor = cfg.get('postprocessor') if self.postprocessor is None: self.postprocessor = 'decode' + self.has_logged = False def _sanitize_parameters(self, **pipeline_parameters): return {}, pipeline_parameters, {} - def forward(self, inputs: Dict[str, Any], + def forward(self, inputs: Union[Dict[str, Any], Tensor], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model.generate(inputs, **forward_params) + try: + return self.model.generate(inputs, **forward_params) + except AttributeError as e: + if not self.has_logged: + logger.warning( + 'When inputs are passed directly, ' + f'the error is {e}, ' + 'which can be ignored if it runs correctly.') + self.has_logged = True + return self.model.generate(**inputs, **forward_params) def decode(self, inputs) -> str: return self.preprocessor.decode( @@ -451,7 +464,7 @@ def forward(self, prompt: str, **forward_params) -> Dict[str, Any]: padding=True, truncation=True, max_length=1024) - input_ids = input_ids.input_ids.cuda() + input_ids = input_ids.input_ids.to(self.model.device) outputs = self.model.generate( input_ids, num_beams=4, do_sample=False, max_new_tokens=256) decoded_sentences = self.tokenizer.batch_decode( diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index 8750cd3bf..7e1dfd057 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -51,14 +51,12 @@ def __init__(self, model: Model, **kwargs): self._src_vocab_path = osp.join( model, self.cfg['dataset']['src_vocab']['file']) - self._src_vocab = dict([ - (w.strip(), i) for i, w in enumerate(open(self._src_vocab_path)) - ]) + self._src_vocab = dict([(w.strip(), i) for i, w in enumerate( + open(self._src_vocab_path, encoding='utf-8'))]) self._trg_vocab_path = osp.join( model, self.cfg['dataset']['trg_vocab']['file']) - self._trg_rvocab = dict([ - (i, w.strip()) for i, w in enumerate(open(self._trg_vocab_path)) - ]) + self._trg_rvocab = dict([(i, w.strip()) for i, w in enumerate( + open(self._trg_vocab_path, encoding='utf-8'))]) tf_config = tf.ConfigProto(allow_soft_placement=True) tf_config.gpu_options.allow_growth = True @@ -81,7 +79,7 @@ def __init__(self, model: Model, **kwargs): self._tok = MosesTokenizer(lang=self._src_lang) self._detok = MosesDetokenizer(lang=self._tgt_lang) - self._bpe = apply_bpe.BPE(open(self._src_bpe_path)) + self._bpe = apply_bpe.BPE(open(self._src_bpe_path, encoding='utf-8')) # model output = self.model(self.input_wids) diff --git a/modelscope/preprocessors/ofa/asr.py b/modelscope/preprocessors/ofa/asr.py index 5d36b829c..da953299d 100644 --- a/modelscope/preprocessors/ofa/asr.py +++ b/modelscope/preprocessors/ofa/asr.py @@ 
-56,7 +56,7 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = random.choice([0.9, 1.0, 1.1]) audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) - wav, sr = librosa.load(audio_bytes, 16000, mono=True) + wav, sr = librosa.load(audio_bytes, sr=16000, mono=True) fbank = self.prepare_fbank( torch.tensor([wav], dtype=torch.float32), sr, @@ -94,7 +94,7 @@ def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = 1.0 audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) - wav, sr = librosa.load(audio_bytes, 16000, mono=True) + wav, sr = librosa.load(audio_bytes, sr=16000, mono=True) fbank = self.prepare_fbank( torch.tensor([wav], dtype=torch.float32), sr, diff --git a/modelscope/preprocessors/templates/__init__.py b/modelscope/preprocessors/templates/__init__.py new file mode 100644 index 000000000..5ac1780df --- /dev/null +++ b/modelscope/preprocessors/templates/__init__.py @@ -0,0 +1,2 @@ +from .base import Template, get_template +from .template import TemplateType diff --git a/modelscope/preprocessors/templates/base.py b/modelscope/preprocessors/templates/base.py new file mode 100644 index 000000000..4504a4bc7 --- /dev/null +++ b/modelscope/preprocessors/templates/base.py @@ -0,0 +1,1041 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import json +import re +from copy import deepcopy +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from modelscope import get_logger +from torch.nn import Module +from torch.nn.utils.rnn import pad_sequence +from transformers import PreTrainedTokenizerBase, StoppingCriteria +from .loss_scale import loss_scale_map +from .tools_prompt import get_tools_prompt +from .utils import load_batch, load_image, rescale_image, fetch_one, to_device, decode_base64 +from .utils import History, Prompt, StopWords, Context, Messages + +logger = get_logger() + +DEFAULT_SYSTEM = 'You are a helpful assistant.' 
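# Editor's note: a hedged sketch of the package entry point defined just below.
# get_template() looks the template name up in TEMPLATE_MAPPING, deep-copies the
# registered Template and binds the tokenizer to it; 'qwen' is an assumed key here,
# and the model id is illustrative only.
from modelscope import AutoTokenizer
from modelscope.preprocessors.templates import get_template

tokenizer = AutoTokenizer.from_pretrained(
    'qwen/Qwen2-7B-Instruct', trust_remote_code=True)
template = get_template('qwen', tokenizer, max_length=2048)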
+ +TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} + + +def get_template( + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, +) -> 'Template': + template_info = TEMPLATE_MAPPING[template_type] + template = deepcopy(template_info['template']) + template.init_template(tokenizer, default_system, max_length, truncation_strategy, **kwargs) + return template + + +def _findall(token_list: List[int], sub_token_list: Union[int, List[int]]) -> List[int]: + """Find the index of a token in the token_list.""" + if isinstance(sub_token_list, int): + sub_token_list = [sub_token_list] + res = [] + idx = -1 + try: + while True: + idx = token_list.index(sub_token_list[0], idx + 1) + if len(sub_token_list) == 1 or sub_token_list == token_list[idx:idx + len(sub_token_list)]: + res.append(idx) + except ValueError: + pass + return res + + +def replace_img_tag(messages: Messages, + replace_token: str, + pattern=r'(.+?)') -> Tuple[str, History, List[str]]: + images_path = [] + new_messages = [] + for i, m in enumerate(messages): + m = m.copy() + if m['content'] is None or m['role'] in ('tool', 'system', 'assistant'): + new_messages.append(m) + else: + images_path += re.findall(pattern, m['content']) + m['content'] = re.sub(pattern, replace_token, m['content']) + new_messages.append(m) + return messages, images_path + + +class StopWordsCriteria(StoppingCriteria): + """Adding extra stop words in template to prevent unstoppable generation + Like suffixes and chat seps in the template. + """ + def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_words: StopWords, **tokenizer_kwargs) -> None: + self.tokenizer = tokenizer + self.stop_words = stop_words + self.tokenizer_kwargs = tokenizer_kwargs + self.start_idx = -1 + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, **kwargs) -> bool: + if self.start_idx == -1: + self.start_idx = len(input_ids[0]) - 1 + tokenizer = self.tokenizer + stop_words = self.stop_words + # [-20:]: Assuming the end tokens do not exceed 20 tokens, + # to avoid input_ids being too long and affecting efficiency. + text = tokenizer.decode(input_ids[0, self.start_idx:][-20:], **self.tokenizer_kwargs) + for stop_word in stop_words: + if isinstance(stop_word, str): + if stop_word in text: + return True + else: # list + if len(stop_word) > 0 and input_ids[0].tolist()[-len(stop_word):] == stop_word: + return True + return False + + +class Template: + """A template class for all supported models. + + Args: + prefix: Prefix tokens before the first turn's prompt + prompt: A list of elements whose types are str and list of integers. The input query part of every turn. + chat_sep: The chat separators between every turn. + suffix: The end tokens after the chat finished. + default_system: A default system instruction. + system_prefix: The prefix if the `system` is not empty. + auto_add_bos: By default, the bos_token is not added. The auto_add_bos option will determine + whether to add it based on `tokenizer.encode('')`. 
+ tools_prompt: The tools prompt name + tool_prompt: The tool prompt, usually useful when there is a tool role + padding_side: The padding side + infer_media_type: The media type supported by the multi-modals + Examples: + system\nYou are a helpful assistant!\nWho are you?\nassistant:I am a robot\nWho are you?\nassistant:I am a robot # noqa + ----------system------------ ---query---- --response- -----chatsep----- ---query--- --response- ----suffix----- + ----------------------------system_prefix---------------------------- ---------------------------- prompt ------------------------------------- ---------------------------- prompt ------------------------------------- + + """ + + special_tokens = ['', '