diff --git a/.dev_scripts/build_base_image.sh b/.dev_scripts/build_base_image.sh deleted file mode 100644 index 8c8c9a0e6..000000000 --- a/.dev_scripts/build_base_image.sh +++ /dev/null @@ -1,135 +0,0 @@ -#!/bin/bash -# default values. -BASE_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04 -BASE_GPU_CUDA113_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel -BASE_GPU_CUDA117_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.7.1-cudnn8-devel -BASE_GPU_CUDA118_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.8.0-cudnn8-devel -MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope -python_version=3.7.13 -torch_version=1.11.0 -cuda_version=11.7.1 -cudatoolkit_version=11.3 -tensorflow_version=1.15.5 -version=None -is_cpu=False -function usage(){ - echo "usage: build.sh " - echo " --python=python_version set python version, default: $python_version" - echo " --cuda=cuda_version set cuda version,only[11.3.0, 11.7.1], fefault: $cuda_version" - echo " --torch=torch_version set pytorch version, fefault: $torch_version" - echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" - echo " --test option for run test before push image, only push on ci test pass" - echo " --cpu option for build cpu version" - echo " --push option for push image to remote repo" -} -for i in "$@"; do - case $i in - --python=*) - python_version="${i#*=}" - shift - ;; - --cuda=*) - cuda_version="${i#*=}" - shift # pytorch version - ;; - --torch=*) - torch_version="${i#*=}" - shift # pytorch version - ;; - --tensorflow=*) - tensorflow_version="${i#*=}" - shift # tensorflow version - ;; - --version=*) - version="${i#*=}" - shift # version - ;; - --cpu) - is_cpu=True - shift # is cpu image - ;; - --push) - is_push=True - shift # option for push image to remote repo - ;; - --help) - usage - exit 0 - ;; - -*|--*) - echo "Unknown option $i" - usage - exit 1 - ;; - *) - ;; - esac -done - -if [ "$cuda_version" == 11.3.0 ]; then - echo "Building base image cuda11.3.0" - BASE_GPU_IMAGE=$BASE_GPU_CUDA113_IMAGE - cudatoolkit_version=cu113 -elif [ "$cuda_version" == 11.7.1 ]; then - echo "Building base image cuda11.7.1" - cudatoolkit_version=cu117 - BASE_GPU_IMAGE=$BASE_GPU_CUDA117_IMAGE -elif [ "$cuda_version" == 11.8.0 ]; then - echo "Building base image cuda11.8.0" - cudatoolkit_version=cu118 - BASE_GPU_IMAGE=$BASE_GPU_CUDA118_IMAGE -else - echo "Unsupport cuda version: $cuda_version" - exit 1 -fi - -if [ "$is_cpu" == "True" ]; then - export BASE_IMAGE=$BASE_CPU_IMAGE - base_tag=ubuntu20.04 - export USE_GPU=False -else - export BASE_IMAGE=$BASE_GPU_IMAGE - base_tag=ubuntu20.04-cuda$cuda_version - export USE_GPU=True -fi -if [[ $python_version == 3.7* ]]; then - base_tag=$base_tag-py37 -elif [[ $python_version == 3.8* ]]; then - base_tag=$base_tag-py38 -else - echo "Unsupport python version: $python_version" - exit 1 -fi - -target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version-base -export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag -export PYTHON_VERSION=$python_version -export TORCH_VERSION=$torch_version -export CUDATOOLKIT_VERSION=$cudatoolkit_version -export TENSORFLOW_VERSION=$tensorflow_version -echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\n" -docker_file_content=`cat docker/Dockerfile.ubuntu_base` -printf "$docker_file_content" > Dockerfile - -while true -do - docker 
build -t $IMAGE_TO_BUILD \ - --build-arg USE_GPU \ - --build-arg BASE_IMAGE \ - --build-arg PYTHON_VERSION \ - --build-arg TORCH_VERSION \ - --build-arg CUDATOOLKIT_VERSION \ - --build-arg TENSORFLOW_VERSION \ - -f Dockerfile . - if [ $? -eq 0 ]; then - echo "Image build done" - break - else - echo "Running docker build command error, we will retry" - fi -done - -if [ "$is_push" == "True" ]; then - echo "Pushing image: $IMAGE_TO_BUILD" - docker push $IMAGE_TO_BUILD -fi diff --git a/.dev_scripts/build_image.sh b/.dev_scripts/build_image.sh deleted file mode 100644 index dceaaa22d..000000000 --- a/.dev_scripts/build_image.sh +++ /dev/null @@ -1,217 +0,0 @@ -#!/bin/bash -# default values. -#BASE_PY37_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py37-torch1.11.0-tf1.15.5-base -#BASE_PY38_CPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py38-torch1.11.0-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py38-torch1.11.0-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.7.1-py38-torch2.0.1-tf1.15.5-base -#BASE_PY38_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.7.1-py38-torch1.13.1-tf2.6.0-base -#BASE_PY37_GPU_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-base -MODELSCOPE_REPO_ADDRESS=reg.docker.alibaba-inc.com/modelscope/modelscope -python_version=3.7.13 -torch_version=1.11.0 -cudatoolkit_version=11.7 -tensorflow_version=1.15.5 -modelscope_version=None -cuda_version=11.7.1 -is_ci_test=False -is_dsw=False -is_cpu=False -run_ci_test=False -function usage(){ - echo "usage: build.sh " - echo " --python=python_version set python version, default: $python_version" - echo " --cuda=cuda_version set cuda version,only[11.3.0, 11.7.1], fefault: $cuda_version" - echo " --torch=torch_version set pytorch version, fefault: $torch_version" - echo " --tensorflow=tensorflow_version set tensorflow version, default: $tensorflow_version" - echo " --modelscope=modelscope_version set modelscope version, default: $modelscope_version" - echo " --test option for run test before push image, only push on ci test pass" - echo " --cpu option for build cpu version" - echo " --dsw option for build dsw version" - echo " --ci option for build ci version" - echo " --push option for push image to remote repo" -} -for i in "$@"; do - case $i in - --python=*) - python_version="${i#*=}" - shift - ;; - --cuda=*) - cuda_version="${i#*=}" - if [ "$cuda_version" == "11.3.0" ]; then - cudatoolkit_version=11.3 - elif [ "$cuda_version" == "11.7.1" ]; then - cudatoolkit_version=11.7 - elif [ "$cuda_version" == "11.8.0" ]; then - cudatoolkit_version=11.8 - else - echo "Unsupport cuda version $cuda_version" - exit 1 - fi - shift # pytorch version - ;; - --torch=*) - torch_version="${i#*=}" - shift # pytorch version - ;; - --tensorflow=*) - tensorflow_version="${i#*=}" - shift # tensorflow version - ;; - --cudatoolkit=*) - cudatoolkit_version="${i#*=}" - shift # cudatoolkit for pytorch - ;; - --modelscope=*) - modelscope_version="${i#*=}" - shift # modelscope version - ;; - --test) - run_ci_test=True - shift # will run ci test - ;; - --cpu) - is_cpu=True - shift # is cpu image - ;; - --ci) - is_ci_test=True - shift # is ci, will not install modelscope - ;; - --dsw) - is_dsw=True - shift # is dsw, will set dsw cache location - ;; - --push) - is_push=True - shift # option for push image to remote repo 
- ;; - --help) - usage - exit 0 - ;; - -*|--*) - echo "Unknown option $i" - usage - exit 1 - ;; - *) - ;; - esac -done - -if [ "$modelscope_version" == "None" ]; then - echo "ModelScope version must specify!" - exit 1 -fi -if [ "$is_cpu" == "True" ]; then - base_tag=ubuntu20.04 - export USE_GPU=False -else - base_tag=ubuntu20.04-cuda$cuda_version - export USE_GPU=True -fi - -if [[ $python_version == 3.7* ]]; then - if [ "$is_cpu" == "True" ]; then - echo "Building python3.7 cpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py37-torch$torch_version-tf$tensorflow_version-base - else - echo "Building python3.7 gpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda$cuda_version-py37-torch$torch_version-tf$tensorflow_version-base - fi - base_tag=$base_tag-py37 -elif [[ $python_version == 3.8* ]]; then - if [ "$is_cpu" == "True" ]; then - echo "Building python3.8 cpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-py38-torch$torch_version-tf$tensorflow_version-base - else - echo "Building python3.8 gpu image" - export BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda$cuda_version-py38-torch$torch_version-tf$tensorflow_version-base - fi - base_tag=$base_tag-py38 -else - echo "Unsupport python version: $python_version" - exit 1 -fi - -target_image_tag=$base_tag-torch$torch_version-tf$tensorflow_version -if [ "$is_ci_test" == "True" ]; then - target_image_tag=$target_image_tag-$modelscope_version-ci -else - target_image_tag=$target_image_tag-$modelscope_version-test -fi -export IMAGE_TO_BUILD=$MODELSCOPE_REPO_ADDRESS:$target_image_tag -export PYTHON_VERSION=$python_version -export TORCH_VERSION=$torch_version -export CUDATOOLKIT_VERSION=$cudatoolkit_version -export TENSORFLOW_VERSION=$tensorflow_version -echo -e "Building image with:\npython$python_version\npytorch$torch_version\ntensorflow:$tensorflow_version\ncudatoolkit:$cudatoolkit_version\ncpu:$is_cpu\nis_ci:$is_ci_test\nis_dsw:$is_dsw\n" -docker_file_content=`cat docker/Dockerfile.ubuntu` -if [ "$is_ci_test" != "True" ]; then - echo "Building ModelScope lib, will install ModelScope lib to image" - docker_file_content="${docker_file_content} \nRUN pip install --no-cache-dir -U funasr transformers && pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/releases/build/modelscope-$modelscope_version-py3-none-any.whl " -fi -echo "$is_dsw" -if [ "$is_dsw" == "False" ]; then - echo "Not DSW image" -else - echo "Building dsw image will need set ModelScope lib cache location." - docker_file_content="${docker_file_content} \nENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope" - # pre compile extension - docker_file_content="${docker_file_content} \nRUN python -c 'from modelscope.utils.pre_compile import pre_compile_all;pre_compile_all()'" - if [ "$is_cpu" == "True" ]; then - echo 'build cpu image' - else - # fix easycv extension and tinycudann conflict. 
- docker_file_content="${docker_file_content} \nRUN bash /tmp/install_tiny_cuda_nn.sh" - fi -fi -if [ "$is_ci_test" == "True" ]; then - echo "Building CI image, uninstall modelscope" - docker_file_content="${docker_file_content} \nRUN pip uninstall modelscope -y" -fi -printf "$docker_file_content" > Dockerfile - -while true -do - docker build -t $IMAGE_TO_BUILD \ - --build-arg USE_GPU \ - --build-arg BASE_IMAGE \ - --build-arg PYTHON_VERSION \ - --build-arg TORCH_VERSION \ - --build-arg CUDATOOLKIT_VERSION \ - --build-arg TENSORFLOW_VERSION \ - -f Dockerfile . - if [ $? -eq 0 ]; then - echo "Image build done" - break - else - echo "Running docker build command error, we will retry" - fi -done - -if [ "$run_ci_test" == "True" ]; then - echo "Running ci case." - export MODELSCOPE_CACHE=/home/mulin.lyh/model_scope_cache - export MODELSCOPE_HOME_CACHE=/home/mulin.lyh/ci_case_home # for credential - export IMAGE_NAME=$MODELSCOPE_REPO_ADDRESS - export IMAGE_VERSION=$target_image_tag - export MODELSCOPE_DOMAIN=www.modelscope.cn - export HUB_DATASET_ENDPOINT=http://www.modelscope.cn - export CI_TEST=True - export TEST_LEVEL=1 - if [ "$is_ci_test" != "True" ]; then - echo "Testing for dsw image or MaaS-lib image" - export CI_COMMAND="python tests/run.py" - fi - bash .dev_scripts/dockerci.sh - if [ $? -ne 0 ]; then - echo "Running unittest failed, please check the log!" - exit -1 - fi -fi -if [ "$is_push" == "True" ]; then - echo "Pushing image: $IMAGE_TO_BUILD" - docker push $IMAGE_TO_BUILD -fi diff --git a/.dev_scripts/dockerci.sh b/.dev_scripts/dockerci.sh index 0278a7857..4f66073cd 100644 --- a/.dev_scripts/dockerci.sh +++ b/.dev_scripts/dockerci.sh @@ -14,6 +14,7 @@ echo "PR modified files: $PR_CHANGED_FILES" PR_CHANGED_FILES=${PR_CHANGED_FILES//[ ]/#} echo "PR_CHANGED_FILES: $PR_CHANGED_FILES" idx=0 +sleep 65 for gpu in $gpus do exec {lock_fd}>"/tmp/gpu$gpu" || exit 1 diff --git a/.dev_scripts/run_docker.sh b/.dev_scripts/run_docker.sh deleted file mode 100644 index 8999458ac..000000000 --- a/.dev_scripts/run_docker.sh +++ /dev/null @@ -1,7 +0,0 @@ -#sudo docker run --name zwm_maas -v /home/wenmeng.zwm/workspace:/home/wenmeng.zwm/workspace --net host -ti reg.docker.alibaba-inc.com/pai-dlc/tensorflow-training:2.3-gpu-py36-cu101-ubuntu18.04 bash -#sudo docker run --name zwm_maas_pytorch -v /home/wenmeng.zwm/workspace:/home/wenmeng.zwm/workspace --net host -ti reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 bash -CONTAINER_NAME=modelscope-dev -IMAGE_NAME=registry.cn-shanghai.aliyuncs.com/modelscope/modelscope -IMAGE_VERSION=v0.1.1-16-g62856fa-devel -MOUNT_DIR=/home/wenmeng.zwm/workspace -sudo docker run --name $CONTAINER_NAME -v $MOUNT_DIR:$MOUNT_DIR --net host -ti ${IMAGE_NAME}:${IMAGE_VERSION} bash diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4fdf7351f..f5a42ca45 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -3,7 +3,7 @@ name: Bug report about: Create a bug report to help us improve title: '' labels: '' -assignees: Firmament-cyou, tastelikefeet, wangxingjun778, wenmengzhou, zzclynn +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -36,14 +36,14 @@ A clear and concise description of what the bug is. 
Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet -Model hub related: @liuyhwangyh +Model hub related: @liuyhwangyh @tastelikefeet @wangxingjun778 Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md index 0731f3c1f..6eef2aa58 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.md +++ b/.github/ISSUE_TEMPLATE/feature_request.md @@ -3,7 +3,7 @@ name: Feature request about: Suggest an idea for this project title: '' labels: '' -assignees: tastelikefeet, wangxingjun778, wenmengzhou, yingdachen, zzclynn +assignees: yingdachen, wangxingjun778, tastelikefeet --- diff --git a/.github/ISSUE_TEMPLATE/question.md b/.github/ISSUE_TEMPLATE/question.md index c7ec72562..3545e5435 100644 --- a/.github/ISSUE_TEMPLATE/question.md +++ b/.github/ISSUE_TEMPLATE/question.md @@ -3,7 +3,7 @@ name: Question about: Describe this issue template's purpose here. title: '' labels: '' -assignees: zzclynn,wenmengzhou +assignees: tastelikefeet, wangxingjun778, yingdachen --- @@ -18,7 +18,7 @@ Before asking a question, make sure you have: Please @ corresponding people according to your problem: -Model related: @wenmengzhou @tastelikefeet +Model related: @tastelikefeet Model hub related: @liuyhwangyh @@ -26,6 +26,6 @@ Dataset releated: @wangxingjun778 Finetune related: @tastelikefeet @Jintao-Huang -Pipeline related: @Firmament-cyou @wenmengzhou +Pipeline related: @tastelikefeet @wangxingjun778 -Contribute your model: @zzclynn +Contribute your model: @yingdachen diff --git a/.github/workflows/docker-image.yml b/.github/workflows/docker-image.yml new file mode 100644 index 000000000..13f61ff31 --- /dev/null +++ b/.github/workflows/docker-image.yml @@ -0,0 +1,54 @@ +name: Build Docker Image + +on: + workflow_dispatch: + inputs: + workflow_name: + description: 'The specific name of this build' + required: true + default: 'build' + modelscope_branch: + description: 'ModelScope branch to build from(release/x.xx)' + required: true + image_type: + description: 'The image type to build(cpu/gpu/llm)' + required: true + modelscope_version: + description: 'ModelScope version to use(x.xx.x)' + required: true + swift_branch: + description: 'SWIFT branch to use(release/x.xx)' + required: true + ci_image: + description: 'Set as the CI image' + default: '0' + required: false + other_params: + description: 'Other params in --xxx xxx' + required: false + +run-name: Docker-${{ inputs.modelscope_branch }}-${{ inputs.image_type }}-${{ inputs.workflow_name }}-by-@${{ github.actor }} + +jobs: + build: + runs-on: [modelscope-self-hosted-us] + + steps: + - name: ResetFileMode + shell: bash + run: | + # reset filemode to allow action runner to delete files + # generated by root in docker + set -e + source ~/.bashrc + sudo chown -R $USER:$USER $ACTION_RUNNER_DIR + - name: Checkout repository + uses: actions/checkout@v4 + with: + ref: ${{ github.event.inputs.modelscope_branch }} + + - name: Build Docker Image + run: | + set -e + source ~/.bashrc + python docker/build_image.py --image_type ${{ github.event.inputs.image_type }} --modelscope_branch ${{ github.event.inputs.modelscope_branch }} --modelscope_version ${{ github.event.inputs.modelscope_version }} --swift_branch ${{ 
github.event.inputs.swift_branch }} --ci_image ${{ github.event.inputs.ci_image }} ${{ github.event.inputs.other_params }} diff --git a/.github/workflows/lint.yaml b/.github/workflows/lint.yaml index dc4b5487b..6ff84517d 100644 --- a/.github/workflows/lint.yaml +++ b/.github/workflows/lint.yaml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: '3.10' - name: Install pre-commit hook run: | pip install pre-commit diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 7c2e180a7..dacf6df78 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -15,10 +15,10 @@ jobs: #if: startsWith(github.event.ref, 'refs/tags') steps: - uses: actions/checkout@v2 - - name: Set up Python 3.7 + - name: Set up Python 3.10 uses: actions/setup-python@v2 with: - python-version: '3.7' + python-version: '3.10' - name: Install wheel run: pip install wheel && pip install -r requirements/framework.txt - name: Build ModelScope diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7e6e9b774..a8565f16b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: https://github.com/pycqa/flake8.git rev: 4.0.0 diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml index a68a5b785..869d8fd66 100644 --- a/.pre-commit-config_local.yaml +++ b/.pre-commit-config_local.yaml @@ -1,3 +1,5 @@ +exclude: 'modelscope/preprocessors/templates/' + repos: - repo: /home/admin/pre-commit/flake8 rev: 4.0.0 diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index b23f3150a..7ec11ef0b 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -61,7 +61,7 @@ representative at an online or offline event. Instances of abusive, harassing, or otherwise unacceptable behavior may be reported to the community leaders responsible for enforcement at -feedback@huggingface.co. +contact@modelscope.cn. All complaints will be reviewed and investigated promptly and fairly. All community leaders are obligated to respect the privacy and security of the diff --git a/LICENSE b/LICENSE index 14cec7de8..d64569567 100644 --- a/LICENSE +++ b/LICENSE @@ -1,4 +1,3 @@ -Copyright 2022-2023 Alibaba ModelScope. All rights reserved. Apache License Version 2.0, January 2004 @@ -188,7 +187,7 @@ Copyright 2022-2023 Alibaba ModelScope. All rights reserved. same "printed page" as the copyright notice for easier identification within third-party archives. - Copyright 2020-2022 Alibaba ModelScope. + Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/README.md b/README.md index dd6d3350e..eb65c053e 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,11 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

@@ -51,35 +56,36 @@ Hundreds of models are made publicly available on [ModelScope]( https://www.mode Some representative examples include: -NLP: +LLM: -* [ChatGLM3-6B](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) * [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [Internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [Udever Multilingual Universal Text Representation Model 1b1](https://modelscope.cn/models/damo/udever-bloom-1b1/summary) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) -* [CoROM Text Vector - Chinese - E-commerce Domain - Base](https://modelscope.cn/models/damo/nlp_corom_sentence-embedding_chinese-base-ecom/summary) - -* [MGeo Address Similarity Matching Entity Alignment - Chinese - Address Field - Base](https://modelscope.cn/models/damo/mgeo_geographic_entity_alignment_chinese_base/summary) Multi-Modal: * [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -* [CogVLM](https://modelscope.cn/models/ZhipuAI/CogVLM/summary) +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) -* [Text-to-Video Synthesis Large Model - English - General Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [I2VGen-XL High Definition Image to Video Large Model](https://modelscope.cn/models/damo/Image-to-Video/summary) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [I2VGen-XL High Definition Video to Video Large Model](https://modelscope.cn/models/damo/Video-to-Video/summary) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) + +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) CV: @@ -293,3 +299,13 @@ We provide additional documentations including: # License This project is licensed under the [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). + +# Citation +``` +@Misc{modelscope, + title = {ModelScope: bring the notion of Model-as-a-Service to life.}, + author = {The ModelScope Team}, + howpublished = {\url{https://github.com/modelscope/modelscope}}, + year = {2023} +} +``` diff --git a/README_ja.md b/README_ja.md index 4523add49..c3c8c50b7 100644 --- a/README_ja.md +++ b/README_ja.md @@ -17,6 +17,11 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

@@ -51,33 +56,36 @@ ModelScope ライブラリは、様々なモデルの実装を保持するだけ 代表的な例をいくつか挙げると: -NLP: - -* [nlp_gpt3_text-generation_2.7B](https://modelscope.cn/models/damo/nlp_gpt3_text-generation_2.7B) +大きなモデル: -* [ChatYuan-large](https://modelscope.cn/models/ClueAI/ChatYuan-large) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [mengzi-t5-base](https://modelscope.cn/models/langboat/mengzi-t5-base) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [nlp_csanmt_translation_en2zh](https://modelscope.cn/models/damo/nlp_csanmt_translation_en2zh) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) -* [nlp_raner_named-entity-recognition_chinese-base-news](https://modelscope.cn/models/damo/nlp_raner_named-entity-recognition_chinese-base-news) +* [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [nlp_structbert_word-segmentation_chinese-base](https://modelscope.cn/models/damo/nlp_structbert_word-segmentation_chinese-base) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [Erlangshen-RoBERTa-330M-Sentiment](https://modelscope.cn/models/fengshenbang/Erlangshen-RoBERTa-330M-Sentiment) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) -* [nlp_convai_text2sql_pretrain_cn](https://modelscope.cn/models/damo/nlp_convai_text2sql_pretrain_cn) マルチモーダル: -* [multi-modal_clip-vit-base-patch16_zh](https://modelscope.cn/models/damo/multi-modal_clip-vit-base-patch16_zh) +* [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) + +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) + +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [ofa_pretrain_base_zh](https://modelscope.cn/models/damo/ofa_pretrain_base_zh) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [Taiyi-Stable-Diffusion-1B-Chinese-v0.1](https://modelscope.cn/models/fengshenbang/Taiyi-Stable-Diffusion-1B-Chinese-v0.1) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) -* [mplug_visual-question-answering_coco_large_en](https://modelscope.cn/models/damo/mplug_visual-question-answering_coco_large_en) +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) CV: diff --git a/README_zh.md b/README_zh.md index 10b2e7288..9c9e4248f 100644 --- a/README_zh.md +++ b/README_zh.md @@ -17,30 +17,33 @@ +[Discord](https://discord.gg/FMupRv4jUR) + +

+modelscope%2Fmodelscope | Trendshift

English | - 中文 | - 日本語 + 中文 | + 日本語

- # 简介 -[ModelScope]( https://www.modelscope.cn) 是一个“模型即服务”(MaaS)平台,旨在汇集来自AI社区的最先进的机器学习模型,并简化在实际应用中使用AI模型的流程。ModelScope库使开发人员能够通过丰富的API设计执行推理、训练和评估,从而促进跨不同AI领域的最先进模型的统一体验。 - -ModelScope Library为模型贡献者提供了必要的分层API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到ModelScope生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。 - -除了包含各种模型的实现之外,ModelScope Library还支持与ModelScope后端服务进行必要的交互,特别是与Model-Hub和Dataset-Hub的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。 +[ModelScope](https://www.modelscope.cn) 是一个 “模型即服务”(MaaS) 平台,旨在汇集来自 AI 社区的最先进的机器学习模型,并简化在实际应用中使用 AI 模型的流程。ModelScope 库使开发人员能够通过丰富的 API 设计执行推理、训练和评估,从而促进跨不同 AI 领域的最先进模型的统一体验。 +ModelScope Library 为模型贡献者提供了必要的分层 API,以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到 ModelScope 生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装,用户只需几行代码即可完成模型推理、微调和评估。同时,灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。 +除了包含各种模型的实现之外,ModelScope Library 还支持与 ModelScope 后端服务进行必要的交互,特别是与 Model-Hub 和 Dataset-Hub 的交互。这种交互促进了模型和数据集的管理在后台无缝执行,包括模型数据集查询、版本控制、缓存管理等。 # 部分模型和在线体验 -ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计算机视觉、语音、多模态、科学计算等,其中包含数百个SOTA模型。用户可以进入ModelScope网站([modelscope.cn](http://www.modelscope.cn))的模型中心零门槛在线体验,或者Notebook方式体验模型。 + +ModelScope 开源了数百个 (当前 700+) 模型,涵盖自然语言处理、计算机视觉、语音、多模态、科学计算等,其中包含数百个 SOTA 模型。用户可以进入 ModelScope 网站 ([modelscope.cn](http://www.modelscope.cn)) 的模型中心零门槛在线体验,或者 Notebook 方式体验模型。
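A minimal sketch of the Model-Hub / Dataset-Hub interaction described in the introduction above, assuming the `snapshot_download` helper from `modelscope.hub.snapshot_download` (exact signatures may vary between ModelScope releases); the model and dataset IDs are simply reused from the examples later in this README.

```python
# Sketch only: Model-Hub / Dataset-Hub access, with IDs reused from the README examples.
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.msdatasets import MsDataset

# Download (and locally cache) a model snapshot from the Model-Hub;
# repeated calls reuse the cache managed by the library.
model_dir = snapshot_download('damo/nlp_structbert_word-segmentation_chinese-base')
print('model cached at:', model_dir)

# Load a dataset split from the Dataset-Hub and rename a column,
# mirroring the fine-tuning example further down.
eval_dataset = MsDataset.load('chinese-poetry-collection', split='test').remap_columns({'text1': 'src_txt'})
print(eval_dataset)
```

The cache location is typically `~/.cache/modelscope` and can be redirected with the `MODELSCOPE_CACHE` environment variable, as the Docker images in this diff do.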


@@ -50,70 +53,65 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 示例如下: -自然语言处理: +大模型: -* [ChatGLM3-6B](https://modelscope.cn/models/ZhipuAI/chatglm3-6b/summary) +* [Yi-1.5-34B-Chat](https://modelscope.cn/models/01ai/Yi-1.5-34B-Chat/summary) -* [Qwen-14B-Chat](https://modelscope.cn/models/qwen/Qwen-14B-Chat/summary) +* [Qwen1.5-110B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-110B-Chat/summary) -* [Baichuan2-13B-Chat](https://modelscope.cn/models/baichuan-inc/Baichuan2-13B-Chat/summary) +* [DeepSeek-V2-Chat](https://modelscope.cn/models/deepseek-ai/DeepSeek-V2-Chat/summary) * [Ziya2-13B-Chat](https://modelscope.cn/models/Fengshenbang/Ziya2-13B-Chat/summary) -* [Internlm-chat-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-chat-20b/summary) - -* [Udever-bloom-1b1](https://modelscope.cn/models/damo/udever-bloom-1b1/summary) +* [Meta-Llama-3-8B-Instruct](https://modelscope.cn/models/LLM-Research/Meta-Llama-3-8B-Instruct/summary) -* [CoROM文本向量-中文-电商领域-base](https://modelscope.cn/models/damo/nlp_corom_sentence-embedding_chinese-base-ecom/summary) - -* [MGeo地址相似度匹配实体对齐-中文-地址领域-base](https://modelscope.cn/models/damo/mgeo_geographic_entity_alignment_chinese_base/summary) +* [Phi-3-mini-128k-instruct](https://modelscope.cn/models/LLM-Research/Phi-3-mini-128k-instruct/summary) 多模态: * [Qwen-VL-Chat](https://modelscope.cn/models/qwen/Qwen-VL-Chat/summary) -* [CogVLM](https://modelscope.cn/models/ZhipuAI/CogVLM/summary) +* [Yi-VL-6B](https://modelscope.cn/models/01ai/Yi-VL-6B/summary) + +* [InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary) -* [Text-to-Video Synthesis Large Model - English - General Domain](https://modelscope.cn/models/damo/text-to-video-synthesis/summary) +* [deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary) -* [I2VGen-XL高清图片到视频大模型](https://modelscope.cn/models/damo/Image-to-Video/summary) +* [OpenSoraPlan](https://modelscope.cn/models/AI-ModelScope/Open-Sora-Plan-v1.0.0/summary) -* [I2VGen-XL高清视频到视频大模型](https://modelscope.cn/models/damo/Video-to-Video/summary) +* [OpenSora](https://modelscope.cn/models/luchentech/OpenSora-STDiT-v1-HQ-16x512x512/summary) +* [I2VGen-XL](https://modelscope.cn/models/iic/i2vgen-xl/summary) 计算机视觉: -* [DamoFD人脸检测关键点模型-0.5G](https://modelscope.cn/models/damo/cv_ddsar_face-detection_iclr23-damofd/summary) +* [DamoFD 人脸检测关键点模型-0.5G](https://modelscope.cn/models/damo/cv_ddsar_face-detection_iclr23-damofd/summary) -* [BSHM人像抠图](https://modelscope.cn/models/damo/cv_unet_image-matting/summary) +* [BSHM 人像抠图](https://modelscope.cn/models/damo/cv_unet_image-matting/summary) -* [DCT-Net人像卡通化-3D](https://modelscope.cn/models/damo/cv_unet_person-image-cartoon-3d_compound-models/summary) +* [DCT-Net 人像卡通化-3D](https://modelscope.cn/models/damo/cv_unet_person-image-cartoon-3d_compound-models/summary) -* [DCT-Net人像卡通化模型-3D](https://modelscope.cn/models/damo/face_chain_control_model/summary) +* [DCT-Net 人像卡通化模型-3D](https://modelscope.cn/models/damo/face_chain_control_model/summary) * [读光-文字识别-行识别模型-中英-通用领域](https://modelscope.cn/models/damo/cv_convnextTiny_ocr-recognition-general_damo/summary) * [读光-文字识别-行识别模型-中英-通用领域](https://modelscope.cn/models/damo/cv_resnet18_ocr-detection-line-level_damo/summary) -* [LaMa图像填充](https://modelscope.cn/models/damo/cv_fft_inpainting_lama/summary) - - - +* [LaMa 图像填充](https://modelscope.cn/models/damo/cv_fft_inpainting_lama/summary) 语音: -* 
[Paraformer语音识别-中文-通用-16k-离线-大型-长音频版本](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) +* [Paraformer 语音识别-中文-通用-16k-离线-大型-长音频版本](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) -* [FSMN声音端点检测-中文-通用-16k-onnx](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-onnx/summary) +* [FSMN 声音端点检测-中文-通用-16k-onnx](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-onnx/summary) -* [Monotonic-Aligner语音时间戳预测-16k-离线](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) +* [Monotonic-Aligner 语音时间戳预测-16k-离线](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) -* [CT-Transformer标点-中文-通用-onnx](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx/summary) +* [CT-Transformer 标点-中文-通用-onnx](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx/summary) * [语音合成-中文-多情绪领域-16k-多发言人](https://modelscope.cn/models/damo/speech_sambert-hifigan_tts_zh-cn_16k/summary) -* [CAM++说话人验证-中文-通用-200k发言人](https://modelscope.cn/models/damo/speech_campplus_sv_zh-cn_16k-common/summary) - +* [CAM++ 说话人验证-中文-通用-200k-发言人](https://modelscope.cn/models/damo/speech_campplus_sv_zh-cn_16k-common/summary) 科学计算: @@ -123,14 +121,15 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 # 快速上手 -我们针对不同任务提供了统一的使用接口, 使用`pipeline`进行模型推理、使用`Trainer`进行微调和评估。 +我们针对不同任务提供了统一的使用接口, 使用 `pipeline` 进行模型推理、使用 `Trainer` 进行微调和评估。 + +对于任意类型输入(图像、文本、音频、视频...)的任何任务,只需 3 行代码即可加载模型并获得推理结果,如下所示: -对于任意类型输入(图像、文本、音频、视频...)的任何任务,只需3行代码即可加载模型并获得推理结果,如下所示: ```python >>> from modelscope.pipelines import pipeline ->>> word_segmentation = pipeline('word-segmentation',model='damo/nlp_structbert_word-segmentation_chinese-base') ->>> word_segmentation('今天天气不错,适合出去游玩') -{'output': '今天 天气 不错 , 适合 出去 游玩'} +>>> word_segmentation = pipeline ('word-segmentation',model='damo/nlp_structbert_word-segmentation_chinese-base') +>>> word_segmentation (' 今天天气不错,适合出去游玩 ') +{'output': ' 今天 天气 不错 , 适合 出去 游玩 '} ``` 给定一张图片,你可以使用如下代码进行人像抠图. @@ -141,42 +140,44 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 >>> import cv2 >>> from modelscope.pipelines import pipeline ->>> portrait_matting = pipeline('portrait-matting') ->>> result = portrait_matting('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matting.png') ->>> cv2.imwrite('result.png', result['output_img']) +>>> portrait_matting = pipeline ('portrait-matting') +>>> result = portrait_matting ('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matting.png') +>>> cv2.imwrite ('result.png', result ['output_img']) ``` + 输出图像如下 ![image](data/resource/portrait_output.png) -对于微调和评估模型, 你需要通过十多行代码构建dataset和trainer,调用`trainer.train()`和`trainer.evaluate()`即可。 +对于微调和评估模型, 你需要通过十多行代码构建 dataset 和 trainer,调用 `trainer.train ()` 和 `trainer.evaluate ()` 即可。 + +例如我们利用 gpt3 1.3B 的模型,加载是诗歌数据集进行 finetune,可以完成古诗生成模型的训练。 -例如我们利用gpt3 1.3B的模型,加载是诗歌数据集进行finetune,可以完成古诗生成模型的训练。 ```python >>> from modelscope.metainfo import Trainers >>> from modelscope.msdatasets import MsDataset >>> from modelscope.trainers import build_trainer ->>> train_dataset = MsDataset.load('chinese-poetry-collection', split='train'). remap_columns({'text1': 'src_txt'}) ->>> eval_dataset = MsDataset.load('chinese-poetry-collection', split='test').remap_columns({'text1': 'src_txt'}) +>>> train_dataset = MsDataset.load ('chinese-poetry-collection', split='train'). 
remap_columns ({'text1': 'src_txt'}) +>>> eval_dataset = MsDataset.load ('chinese-poetry-collection', split='test').remap_columns ({'text1': 'src_txt'}) >>> max_epochs = 10 >>> tmp_dir = './gpt3_poetry' ->>> kwargs = dict( +>>> kwargs = dict ( model='damo/nlp_gpt3_text-generation_1.3B', train_dataset=train_dataset, eval_dataset=eval_dataset, max_epochs=max_epochs, work_dir=tmp_dir) ->>> trainer = build_trainer(name=Trainers.gpt3_trainer, default_args=kwargs) ->>> trainer.train() +>>> trainer = build_trainer (name=Trainers.gpt3_trainer, default_args=kwargs) +>>> trainer.train () ``` -# 为什么要用ModelScope library +# 为什么要用 ModelScope Library -1. 针对不同任务、不同模型抽象了统一简洁的用户接口,3行代码完成推理,10行代码完成模型训练,方便用户使用ModelScope社区中多个领域的不同模型,开箱即用,便于AI入门和教学。 +1. 针对不同任务、不同模型抽象了统一简洁的用户接口,3 行代码完成推理,10 行代码完成模型训练,方便用户使用 ModelScope 社区中多个领域的不同模型,开箱即用,便于 AI 入门和教学。 -2. 构造以模型为中心的开发应用体验,支持模型训练、推理、导出部署,方便用户基于ModelScope Library构建自己的MLOps. +2. 构造以模型为中心的开发应用体验,支持模型训练、推理、导出部署,方便用户基于 ModelScope Library 构建自己的 MLOps. 3. 针对模型推理、训练流程,进行了模块化的设计,并提供了丰富的功能模块实现,方便用户定制化开发来自定义自己的推理、训练等过程。 @@ -185,11 +186,13 @@ ModelScope开源了数百个(当前700+)模型,涵盖自然语言处理、计 # 安装 ## 镜像 -ModelScope Library目前支持tensorflow,pytorch深度学习框架进行模型训练、推理, 在Python 3.7+, Pytorch 1.8+, Tensorflow1.15/Tensorflow2.0+测试可运行。 -为了让大家能直接用上ModelScope平台上的所有模型,无需配置环境,ModelScope提供了官方镜像,方便有需要的开发者获取。地址如下: +ModelScope Library 目前支持 tensorflow,pytorch 深度学习框架进行模型训练、推理, 在 Python 3.7+, Pytorch 1.8+, Tensorflow1.15/Tensorflow2.0 + 测试可运行。 + +为了让大家能直接用上 ModelScope 平台上的所有模型,无需配置环境,ModelScope 提供了官方镜像,方便有需要的开发者获取。地址如下: + +CPU 镜像 -CPU镜像 ```shell # py37 registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py37-torch1.11.0-tf1.15.5-1.6.1 @@ -198,7 +201,8 @@ registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py37-to registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py38-torch2.0.1-tf2.13.0-1.9.5 ``` -GPU镜像 +GPU 镜像 + ```shell # py37 registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.6.1 @@ -207,81 +211,91 @@ registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11. 
registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.8.0-py38-torch2.0.1-tf2.13.0-1.9.5 ``` -## 搭建本地Python环境 +## 搭建本地 Python 环境 + +你也可以使用 pip 和 conda 搭建本地 python 环境,ModelScope 支持 python3.7 + 以上环境,我们推荐使用 [Anaconda](https://docs.anaconda.com/anaconda/install/),安装完成后,执行如下命令为 modelscope library 创建对应的 python 环境: -你也可以使用pip和conda搭建本地python环境,ModelScope支持python3.7+以上环境,我们推荐使用[Anaconda](https://docs.anaconda.com/anaconda/install/),安装完成后,执行如下命令为modelscope library创建对应的python环境: ```shell conda create -n modelscope python=3.8 conda activate modelscope ``` 接下来根据所需使用的模型依赖安装底层计算框架 -* 安装Pytorch [文档链接](https://pytorch.org/get-started/locally/) -* 安装tensorflow [文档链接](https://www.tensorflow.org/install/pip) +* 安装 Pytorch [文档链接](https://pytorch.org/get-started/locally/) +* 安装 tensorflow [文档链接](https://www.tensorflow.org/install/pip) + +安装完前置依赖,你可以按照如下方式安装 ModelScope Library。 -安装完前置依赖,你可以按照如下方式安装ModelScope Library。 +ModelScope Libarary 由核心框架,以及不同领域模型的对接组件组成。如果只需要 ModelScope 模型和数据集访问等基础能力,可以只安装 ModelScope 的核心框架: -ModelScope Libarary由核心框架,以及不同领域模型的对接组件组成。如果只需要ModelScope模型和数据集访问等基础能力,可以只安装ModelScope的核心框架: ```shell pip install modelscope ``` 如仅需体验多模态领域的模型,可执行如下命令安装领域依赖: + ```shell -pip install modelscope[multi-modal] +pip install modelscope [multi-modal] ``` -如仅需体验NLP领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): +如仅需体验 NLP 领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[nlp] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [nlp] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -If you want to use cv models: +如仅需体验计算机视觉领域的模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[cv] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [cv] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -如仅需体验语音领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): +如仅需体验语音领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): + ```shell -pip install modelscope[audio] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [audio] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -`注意`:当前大部分语音模型需要在Linux环境上使用,并且推荐使用python3.7 + tensorflow 1.x的组合。 +`注意`:当前大部分语音模型需要在 Linux 环境上使用,并且推荐使用 python3.7 + tensorflow 1.x 的组合。 + +如仅需体验科学计算领域模型,可执行如下命令安装领域依赖(因部分依赖由 ModelScope 独立 host,所以需要使用 "-f" 参数): -如仅需体验科学计算领域模型,可执行如下命令安装领域依赖(因部分依赖由ModelScope独立host,所以需要使用"-f"参数): ```shell -pip install modelscope[science] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html +pip install modelscope [science] -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html ``` -`注`: -1. 目前部分语音相关的模型仅支持 python3.7,tensorflow1.15.4的Linux环境使用。 其他绝大部分模型可以在windows、mac(x86)上安装使用。. +`注意`: + +1. 目前部分语音相关的模型仅支持 python3.7,tensorflow1.15.4 的 Linux 环境使用。 其他绝大部分模型可以在 windows、mac(x86)上安装使用。 + +2. 语音领域中一部分模型使用了三方库 SoundFile 进行 wav 文件处理,在 Linux 系统上用户需要手动安装 SoundFile 的底层依赖库 libsndfile,在 Windows 和 MacOS 上会自动安装不需要用户操作。详细信息可参考 [SoundFile 官网](https://github.com/bastibe/python-soundfile#installation)。以 Ubuntu 系统为例,用户需要执行如下命令: -2. 语音领域中一部分模型使用了三方库SoundFile进行wav文件处理,在Linux系统上用户需要手动安装SoundFile的底层依赖库libsndfile,在Windows和MacOS上会自动安装不需要用户操作。详细信息可参考[SoundFile 官网](https://github.com/bastibe/python-soundfile#installation)。以Ubuntu系统为例,用户需要执行如下命令: ```shell sudo apt-get update sudo apt-get install libsndfile1 ``` -3. 
CV领域的少数模型,需要安装mmcv-full, 如果运行过程中提示缺少mmcv,请参考mmcv[安装手册](https://github.com/open-mmlab/mmcv#installation)进行安装。 这里提供一个最简版的mmcv-full安装步骤,但是要达到最优的mmcv-full的安装效果(包括对于cuda版本的兼容),请根据自己的实际机器环境,以mmcv官方安装手册为准。 +3. CV 领域的少数模型,需要安装 mmcv-full, 如果运行过程中提示缺少 mmcv,请参考 mmcv [安装手册](https://github.com/open-mmlab/mmcv#installation) 进行安装。 这里提供一个最简版的 mmcv-full 安装步骤,但是要达到最优的 mmcv-full 的安装效果(包括对于 cuda 版本的兼容),请根据自己的实际机器环境,以 mmcv 官方安装手册为准。 + ```shell pip uninstall mmcv # if you have installed mmcv, uninstall it pip install -U openmim mim install mmcv-full ``` - # 更多教程 除了上述内容,我们还提供如下信息: + * [更加详细的安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) * [任务的介绍](https://modelscope.cn/docs/%E4%BB%BB%E5%8A%A1%E7%9A%84%E4%BB%8B%E7%BB%8D) * [模型推理](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E6%8E%A8%E7%90%86Pipeline) * [模型微调](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AE%AD%E7%BB%83Train) * [数据预处理](https://modelscope.cn/docs/%E6%95%B0%E6%8D%AE%E7%9A%84%E9%A2%84%E5%A4%84%E7%90%86) * [模型评估](https://modelscope.cn/docs/%E6%A8%A1%E5%9E%8B%E7%9A%84%E8%AF%84%E4%BC%B0) -* [贡献模型到ModelScope](https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88) +* [贡献模型到 ModelScope](https://modelscope.cn/docs/ModelScope%E6%A8%A1%E5%9E%8B%E6%8E%A5%E5%85%A5%E6%B5%81%E7%A8%8B%E6%A6%82%E8%A7%88) # License -本项目使用[Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). +本项目使用 [Apache License (Version 2.0)](https://github.com/modelscope/modelscope/blob/master/LICENSE). diff --git a/data/test b/data/test index 77a9ad7fb..dedb3ce44 160000 --- a/data/test +++ b/data/test @@ -1 +1 @@ -Subproject commit 77a9ad7fb3cc4bcc99f4a33822c813e7ab473ba0 +Subproject commit dedb3ce44796328b58a2aa47d3434037a9d63c7f diff --git a/docker/.dockerignore b/docker/.dockerignore index 14284cb62..0fc13a9b7 100644 --- a/docker/.dockerignore +++ b/docker/.dockerignore @@ -1,4 +1,3 @@ -*.sh *.md *.dockerfile *.zip diff --git a/docker/Dockerfile.extra_install b/docker/Dockerfile.extra_install new file mode 100644 index 000000000..a815f7123 --- /dev/null +++ b/docker/Dockerfile.extra_install @@ -0,0 +1,139 @@ +ENV TZ=Asia/Shanghai +ENV arch=x86_64 +SHELL ["/bin/bash", "-c"] +COPY docker/rcfiles /tmp/resources +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall ca-certificates && \ + apt-get install -y make apt-utils openssh-server locales wget git strace gdb sox libopenmpi-dev curl \ + iputils-ping net-tools iproute2 autoconf automake gperf libre2-dev libssl-dev \ + libtool libcurl4-openssl-dev libb64-dev libgoogle-perftools-dev patchelf \ + rapidjson-dev scons software-properties-common pkg-config unzip zlib1g-dev \ + libbz2-dev libreadline-dev libsqlite3-dev llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev liblzma-dev \ + libarchive-dev libxml2-dev libnuma-dev cmake \ + libgeos-dev strace vim ffmpeg libsm6 tzdata language-pack-zh-hans \ + ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build \ + libjpeg-dev libpng-dev && \ + wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ + dpkg -i ./git-lfs_3.2.0_amd64.deb && \ + rm -f ./git-lfs_3.2.0_amd64.deb && \ + locale-gen zh_CN && \ + locale-gen zh_CN.utf8 && \ + update-locale LANG=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 && \ + ln -fs /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \ + dpkg-reconfigure --frontend noninteractive tzdata && \ + apt-get clean 
&& \ + rm -rf /var/lib/apt/lists/* + +ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 +RUN wget -O /tmp/boost.tar.gz https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz && \ + cd /tmp && tar xzf boost.tar.gz && \ + mv /tmp/boost_1_80_0/boost /usr/include/boost && \ + rm -rf /tmp/boost_1_80_0 && rm -rf boost.tar.gz + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +RUN set -eux; \ + \ + wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz"; \ + wget -O python.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc"; \ + GNUPGHOME="$(mktemp -d)"; export GNUPGHOME; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$GPG_KEY"; \ + gpg --batch --verify python.tar.xz.asc python.tar.xz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" python.tar.xz.asc; \ + mkdir -p /usr/src/python; \ + tar --extract --directory /usr/src/python --strip-components=1 --file python.tar.xz; \ + rm python.tar.xz; \ + \ + cd /usr/src/python; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ + ./configure \ + --build="$gnuArch" \ + --enable-loadable-sqlite-extensions \ + --enable-optimizations \ + --enable-option-checking=fatal \ + --enable-shared \ + --with-lto \ + --with-system-expat \ + --without-ensurepip \ + ; \ + nproc="$(nproc)"; \ + EXTRA_CFLAGS="$(dpkg-buildflags --get CFLAGS)"; \ + LDFLAGS="$(dpkg-buildflags --get LDFLAGS)"; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:-}" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + ; \ + rm python; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:--Wl},-rpath='\$\$ORIGIN/../lib'" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + python \ + ; \ + make install; \ + \ + bin="$(readlink -ve /usr/local/bin/python3)"; \ + dir="$(dirname "$bin")"; \ + mkdir -p "/usr/share/gdb/auto-load/$dir"; \ + cp -vL Tools/gdb/libpython.py "/usr/share/gdb/auto-load/$bin-gdb.py"; \ + \ + cd /; \ + rm -rf /usr/src/python; \ + \ + find /usr/local -depth \ + \( \ + \( -type d -a \( -name test -o -name tests -o -name idle_test \) \) \ + -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' -o -name 'libpython*.a' \) \) \ + \) -exec rm -rf '{}' + \ + ; \ + \ + ldconfig; \ + \ + python3 --version + +# make some useful symlinks that are expected to exist ("/usr/local/bin/python" and friends) +RUN set -eux; \ + for src in idle3 pydoc3 python3 python3-config; do \ + dst="$(echo "$src" | tr -d 3)"; \ + [ -s "/usr/local/bin/$src" ]; \ + [ ! 
-e "/usr/local/bin/$dst" ]; \ + ln -svT "$src" "/usr/local/bin/$dst"; \ + done + +# if this is called "PIP_VERSION", pip explodes with "ValueError: invalid truth value ''" +ENV PYTHON_PIP_VERSION 23.0.1 +# https://github.com/docker-library/python/issues/365 +ENV PYTHON_SETUPTOOLS_VERSION 65.5.1 +# https://github.com/pypa/get-pip +ENV PYTHON_GET_PIP_URL https://github.com/pypa/get-pip/raw/dbf0c85f76fb6e1ab42aa672ffca6f0a675d9ee4/public/get-pip.py +ENV PYTHON_GET_PIP_SHA256 dfe9fd5c28dc98b5ac17979a953ea550cec37ae1b47a5116007395bfacff2ab9 + +RUN set -eux; \ + \ + wget -O get-pip.py "$PYTHON_GET_PIP_URL"; \ + echo "$PYTHON_GET_PIP_SHA256 *get-pip.py" | sha256sum -c -; \ + \ + export PYTHONDONTWRITEBYTECODE=1; \ + \ + python get-pip.py \ + --disable-pip-version-check \ + --no-cache-dir \ + --no-compile \ + "pip==$PYTHON_PIP_VERSION" \ + "setuptools==$PYTHON_SETUPTOOLS_VERSION" \ + ; \ + rm -f get-pip.py; \ + \ + pip --version +# end of install python diff --git a/docker/Dockerfile.ubuntu b/docker/Dockerfile.ubuntu index 4ac4fd533..0ec13d124 100644 --- a/docker/Dockerfile.ubuntu +++ b/docker/Dockerfile.ubuntu @@ -1,12 +1,30 @@ -ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-base -FROM $BASE_IMAGE +FROM {base_image} -RUN apt-get update && apt-get install -y iputils-ping net-tools iproute2 && \ +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Shanghai +ENV arch=x86_64 + +COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh +RUN apt-get update && \ + apt-get install -y libsox-dev unzip libaio-dev zip iputils-ping telnet sudo git net-tools && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# install modelscope + +{extra_content} + +COPY {meta_file} /tmp/install.sh + +ARG INSTALL_MS_DEPS={install_ms_deps} + +# install dependencies COPY requirements /var/modelscope -RUN pip install --no-cache-dir --upgrade pip && \ + +RUN pip uninstall ms-swift modelscope -y && pip --no-cache-dir install pip==23.* -U && \ + pip install --no-cache-dir apex -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ +if [ "$INSTALL_MS_DEPS" = "True" ]; then \ + pip --no-cache-dir install omegaconf==2.0.6 && \ + pip install --no-cache-dir 'cython<=0.29.36' versioneer 'numpy<2.0' -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir kwsbp==0.0.6 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/audio.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/cv.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ @@ -14,49 +32,39 @@ RUN pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir -r /var/modelscope/nlp.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/science.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ pip install --no-cache-dir -r /var/modelscope/tests.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ - pip cache purge + pip install --no-cache-dir -r /var/modelscope/server.txt && \ + pip install --no-cache-dir https://modelscope.oss-cn-beijing.aliyuncs.com/packages/imageio_ffmpeg-0.4.9-py3-none-any.whl --no-dependencies 
--force && \ + pip install adaseq pai-easycv && \ + pip install --no-cache-dir 'scipy<1.13.0' && \ + pip install --no-cache-dir funtextprocessing typeguard==2.13.3 scikit-learn -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip install --no-cache-dir text2sql_lgesql==1.3.0 git+https://github.com/jin-s13/xtcocoapi.git@v1.14 git+https://github.com/gatagat/lap.git@v0.4.0 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html --force --no-deps && \ + pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 mpi4py paint_ldm ipykernel fasttext -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip cache purge; \ +else \ + pip install --no-cache-dir -r /var/modelscope/framework.txt -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + pip cache purge; \ +fi -# install jupyter plugin -RUN mkdir -p /root/.local/share/jupyter/labextensions/ && \ - cp -r /tmp/resources/jupyter_plugins/* /root/.local/share/jupyter/labextensions/ +RUN echo "cache bust $(date +%Y%m%d%H%M%S)" && \ + sh /tmp/install.sh {version_args} && \ + curl -fsSL https://ollama.com/install.sh | sh && \ + pip install --no-cache-dir -U funasr scikit-learn && \ + pip install --no-cache-dir -U qwen_vl_utils pyav librosa timm transformers accelerate peft trl safetensors && \ + cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {modelscope_branch} --single-branch https://github.com/modelscope/modelscope.git && \ + cd modelscope && pip install . -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html && \ + cd / && rm -fr /tmp/modelscope && pip cache purge; \ + cd /tmp && GIT_LFS_SKIP_SMUDGE=1 git clone -b {swift_branch} --single-branch https://github.com/modelscope/ms-swift.git && \ + cd ms-swift && pip install .[llm] && \ + pip install .[eval] && pip install evalscope -U --no-dependencies && pip install xtuner --no-dependencies && \ + cd / && rm -fr /tmp/ms-swift && pip cache purge; \ + pip install --no-cache-dir torch=={torch_version} torchvision=={torchvision_version} torchaudio=={torchaudio_version} {index_url} && \ + pip install --no-cache-dir transformers huggingface-hub==0.25.* -U && pip cache purge; \ + pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ + pip config set install.trusted-host mirrors.aliyun.com && \ + cp /tmp/resources/ubuntu2204.aliyun /etc/apt/sources.list -COPY docker/scripts/modelscope_env_init.sh /usr/local/bin/ms_env_init.sh -# python3.8 pip install git+https://github.com/jin-s13/xtcocoapi.git@v1.13 -# pip install git+https://github.com/gatagat/lap.git@v0.4.0 -RUN pip install --no-cache-dir text2sql_lgesql==1.3.0 \ - git+https://github.com/jin-s13/xtcocoapi.git@v1.13 \ - git+https://github.com/gatagat/lap.git@v0.4.0 \ - detectron2==0.3 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html --force --no-deps - -RUN pip install --no-cache-dir mpi4py paint_ldm \ - mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 pai-easycv ms_swift \ - ipykernel fasttext fairseq deepspeed -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html - -ARG USE_GPU -# for cpu install cpu version faiss, faiss depends on blas lib, we install libopenblas TODO rename gpu or cpu version faiss -RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir funtextprocessing kwsbp==0.0.6 faiss==1.7.2 safetensors typeguard==2.13.3 scikit-learn librosa==0.9.2 funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - else \ - pip install --no-cache-dir 
funtextprocessing kwsbp==0.0.6 https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/faiss-1.7.2-py37-none-linux_x86_64.whl safetensors typeguard==2.13.3 scikit-learn librosa==0.9.2 funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - fi - -RUN pip install --no-cache-dir wenetruntime==1.11.0 adaseq --no-deps -COPY examples /modelscope/examples - -# for pai-easycv setup compatiblity issue ENV SETUPTOOLS_USE_DISTUTILS=stdlib - -RUN if [ "$USE_GPU" = "True" ] ; then \ - CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6" pip install --no-cache-dir 'git+https://github.com/facebookresearch/detectron2.git'; \ - else \ - echo 'cpu unsupport detectron2'; \ - fi - -# torchmetrics==0.11.4 for ofa -RUN pip install --no-cache-dir jupyterlab torchmetrics==0.11.4 tiktoken transformers_stream_generator 'protobuf<=3.20.0' bitsandbytes basicsr -COPY docker/scripts/install_flash_attension.sh /tmp/install_flash_attension.sh -RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_flash_attension.sh; \ - else \ - echo 'cpu unsupport flash attention'; \ - fi +ENV VLLM_USE_MODELSCOPE=True +ENV LMDEPLOY_USE_MODELSCOPE=True +ENV MODELSCOPE_CACHE=/mnt/workspace/.cache/modelscope +SHELL ["/bin/bash", "-c"] diff --git a/docker/Dockerfile.ubuntu_base b/docker/Dockerfile.ubuntu_base index b848e1a12..903c99309 100644 --- a/docker/Dockerfile.ubuntu_base +++ b/docker/Dockerfile.ubuntu_base @@ -1,20 +1,20 @@ -ARG BASE_IMAGE=reg.docker.alibaba-inc.com/modelscope/ubuntu:20.04-cuda11.3.0-cudnn8-devel +ARG BASE_IMAGE={base_image} FROM $BASE_IMAGE ARG DEBIAN_FRONTEND=noninteractive ENV TZ=Asia/Shanghai -ENV CONDA_DIR /opt/conda -ENV PATH="${CONDA_DIR}/bin:${PATH}" ENV arch=x86_64 SHELL ["/bin/bash", "-c"] COPY docker/rcfiles /tmp/resources -COPY docker/jupyter_plugins /tmp/resources/jupyter_plugins -RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ - apt-get clean && \ - cp /tmp/resources/sources.list.aliyun /etc/apt/sources.list && \ - apt-get update && \ - apt-get install -y locales wget git strace gdb sox libopenmpi-dev curl \ +RUN apt-get update && apt-get upgrade -y && apt-get install -y --reinstall ca-certificates && \ + apt-get install -y make apt-utils openssh-server locales wget git strace gdb sox libopenmpi-dev curl \ + iputils-ping net-tools iproute2 autoconf automake gperf libre2-dev libssl-dev \ + libtool libcurl4-openssl-dev libb64-dev libgoogle-perftools-dev patchelf \ + rapidjson-dev scons software-properties-common pkg-config unzip zlib1g-dev \ + libbz2-dev libreadline-dev libsqlite3-dev llvm libncurses5-dev libncursesw5-dev xz-utils tk-dev liblzma-dev \ + libarchive-dev libxml2-dev libnuma-dev cmake \ libgeos-dev strace vim ffmpeg libsm6 tzdata language-pack-zh-hans \ - ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build && \ + ttf-wqy-microhei ttf-wqy-zenhei xfonts-wqy libxext6 build-essential ninja-build \ + libjpeg-dev libpng-dev && \ wget https://packagecloud.io/github/git-lfs/packages/debian/bullseye/git-lfs_3.2.0_amd64.deb/download -O ./git-lfs_3.2.0_amd64.deb && \ dpkg -i ./git-lfs_3.2.0_amd64.deb && \ rm -f ./git-lfs_3.2.0_amd64.deb && \ @@ -27,118 +27,189 @@ RUN apt-get update && apt-get install -y --reinstall ca-certificates && \ rm -rf /var/lib/apt/lists/* ENV LANG=zh_CN.UTF-8 LANGUAGE=zh_CN.UTF-8 LC_ALL=zh_CN.UTF-8 +RUN wget -O /tmp/boost.tar.gz https://boostorg.jfrog.io/artifactory/main/release/1.80.0/source/boost_1_80_0.tar.gz && \ + cd /tmp && tar xzf boost.tar.gz && \ + mv 
/tmp/boost_1_80_0/boost /usr/include/boost && \ + rm -rf /tmp/boost_1_80_0 && rm -rf boost.tar.gz + +#install and config python copy from https://github.com/docker-library/python/blob/1b7a1106674a21e699b155cbd53bf39387284cca/3.10/bookworm/Dockerfile +ARG PYTHON_VERSION={python_version} +ENV PATH /usr/local/bin:$PATH +ENV GPG_KEY A035C8C19219BA821ECEA86B64E628F8D684696D +ENV PYTHON_VERSION {python_version} + +RUN set -eux; \ + \ + wget -O python.tar.xz "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz"; \ + wget -O python.tar.xz.asc "https://www.python.org/ftp/python/${PYTHON_VERSION%%[a-z]*}/Python-$PYTHON_VERSION.tar.xz.asc"; \ + GNUPGHOME="$(mktemp -d)"; export GNUPGHOME; \ + gpg --batch --keyserver hkps://keys.openpgp.org --recv-keys "$GPG_KEY"; \ + gpg --batch --verify python.tar.xz.asc python.tar.xz; \ + gpgconf --kill all; \ + rm -rf "$GNUPGHOME" python.tar.xz.asc; \ + mkdir -p /usr/src/python; \ + tar --extract --directory /usr/src/python --strip-components=1 --file python.tar.xz; \ + rm python.tar.xz; \ + \ + cd /usr/src/python; \ + gnuArch="$(dpkg-architecture --query DEB_BUILD_GNU_TYPE)"; \ + ./configure \ + --build="$gnuArch" \ + --enable-loadable-sqlite-extensions \ + --enable-optimizations \ + --enable-option-checking=fatal \ + --enable-shared \ + --with-lto \ + --with-system-expat \ + --without-ensurepip \ + ; \ + nproc="$(nproc)"; \ + EXTRA_CFLAGS="$(dpkg-buildflags --get CFLAGS)"; \ + LDFLAGS="$(dpkg-buildflags --get LDFLAGS)"; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:-}" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + ; \ +# https://github.com/docker-library/python/issues/784 +# prevent accidental usage of a system installed libpython of the same version + rm python; \ + make -j "$nproc" \ + "EXTRA_CFLAGS=${EXTRA_CFLAGS:-}" \ + "LDFLAGS=${LDFLAGS:--Wl},-rpath='\$\$ORIGIN/../lib'" \ + "PROFILE_TASK=${PROFILE_TASK:-}" \ + python \ + ; \ + make install; \ + \ +# enable GDB to load debugging data: https://github.com/docker-library/python/pull/701 + bin="$(readlink -ve /usr/local/bin/python3)"; \ + dir="$(dirname "$bin")"; \ + mkdir -p "/usr/share/gdb/auto-load/$dir"; \ + cp -vL Tools/gdb/libpython.py "/usr/share/gdb/auto-load/$bin-gdb.py"; \ + \ + cd /; \ + rm -rf /usr/src/python; \ + \ + find /usr/local -depth \ + \( \ + \( -type d -a \( -name test -o -name tests -o -name idle_test \) \) \ + -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' -o -name 'libpython*.a' \) \) \ + \) -exec rm -rf '{}' + \ + ; \ + \ + ldconfig; \ + \ + python3 --version + +# make some useful symlinks that are expected to exist ("/usr/local/bin/python" and friends) +RUN set -eux; \ + for src in idle3 pydoc3 python3 python3-config; do \ + dst="$(echo "$src" | tr -d 3)"; \ + [ -s "/usr/local/bin/$src" ]; \ + [ ! -e "/usr/local/bin/$dst" ]; \ + ln -svT "$src" "/usr/local/bin/$dst"; \ + done -#install and config python -ARG PYTHON_VERSION=3.7.13 -# Miniconda3-py37_23.1.0-1-Linux-x86_64.sh is last python3.7 version -RUN if [ "$PYTHON_VERSION" = "3.7.13" ] ; then \ - wget --quiet https://mirrors.aliyun.com/anaconda/miniconda/Miniconda3-py37_23.1.0-1-Linux-x86_64.sh -O ./miniconda.sh && \ - /bin/bash miniconda.sh -b -p /opt/conda && \ - rm -f miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". 
/opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - cp /tmp/resources/conda.tuna ~/.condarc && \ - source /root/.bashrc && \ - conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip config set install.trusted-host mirrors.aliyun.com;\ -else \ - wget --quiet https://mirrors.aliyun.com/anaconda/miniconda/Miniconda3-latest-Linux-${arch}.sh -O ./miniconda.sh && \ - /bin/bash miniconda.sh -b -p /opt/conda && \ - rm -f miniconda.sh && \ - ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh && \ - echo ". /opt/conda/etc/profile.d/conda.sh" >> ~/.bashrc && \ - cp /tmp/resources/conda.tuna ~/.condarc && \ - source /root/.bashrc && \ - conda install --yes python==${PYTHON_VERSION} && \ - pip config set global.index-url https://mirrors.aliyun.com/pypi/simple && \ - pip config set install.trusted-host mirrors.aliyun.com;\ -fi - -ARG USE_GPU=True +# if this is called "PIP_VERSION", pip explodes with "ValueError: invalid truth value ''" +ENV PYTHON_PIP_VERSION 23.0.1 +# https://github.com/docker-library/python/issues/365 +ENV PYTHON_SETUPTOOLS_VERSION 65.5.1 +# https://github.com/pypa/get-pip +ENV PYTHON_GET_PIP_URL https://github.com/pypa/get-pip/raw/dbf0c85f76fb6e1ab42aa672ffca6f0a675d9ee4/public/get-pip.py +ENV PYTHON_GET_PIP_SHA256 dfe9fd5c28dc98b5ac17979a953ea550cec37ae1b47a5116007395bfacff2ab9 + +RUN set -eux; \ + \ + wget -O get-pip.py "$PYTHON_GET_PIP_URL"; \ + echo "$PYTHON_GET_PIP_SHA256 *get-pip.py" | sha256sum -c -; \ + \ + export PYTHONDONTWRITEBYTECODE=1; \ + \ + python get-pip.py \ + --disable-pip-version-check \ + --no-cache-dir \ + --no-compile \ + "pip==$PYTHON_PIP_VERSION" \ + "setuptools==$PYTHON_SETUPTOOLS_VERSION" \ + ; \ + rm -f get-pip.py; \ + \ + pip --version +# end of install python + +ARG USE_GPU={use_gpu} # install pytorch -ARG TORCH_VERSION=1.12.0 -ARG CUDATOOLKIT_VERSION=cu117 -RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/$CUDATOOLKIT_VERSION; \ - else \ - pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ - fi +ARG TORCH_VERSION={torch_version} +ARG CUDATOOLKIT_VERSION={cudatoolkit_version} -# install tensorflow -ARG TENSORFLOW_VERSION=1.15.5 RUN if [ "$USE_GPU" = "True" ] ; then \ - if [ "$TENSORFLOW_VERSION" = "1.15.5" ] ; then \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ - else \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - fi \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio; \ else \ - # only python 3.7 has tensorflow 1.15.5 - if [ "$PYTHON_VERSION" = "3.7.13" ] ; then \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - elif [ "$TENSORFLOW_VERSION" = "1.15.5" ] ; then \ - pip install --no-cache-dir numpy==1.18.5 https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/tensorflow-1.15.5-cp38-cp38-linux_x86_64.whl; \ - else \ - pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ - fi \ + pip install --no-cache-dir torch==$TORCH_VERSION torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cpu; \ fi -# mmcv-full<=1.7.0 for mmdet3d compatible -RUN if [ "$USE_GPU" = "True" ] ; then \ - CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="5.0 5.2 6.0 6.1 7.0 7.5 8.0 8.6" MMCV_WITH_OPS=1 MAX_JOBS=8 
FORCE_CUDA=1 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \ - else \ - MMCV_WITH_OPS=1 MAX_JOBS=8 pip install --no-cache-dir 'mmcv-full<=1.7.0' && pip cache purge; \ - fi # default shell bash ENV SHELL=/bin/bash # install special package RUN if [ "$USE_GPU" = "True" ] ; then \ - pip install dgl -f https://data.dgl.ai/wheels/$CUDATOOLKIT_VERSION/repo.html; \ + pip install --no-cache-dir dgl -f https://data.dgl.ai/wheels/$CUDATOOLKIT_VERSION/repo.html; \ else \ - pip install --no-cache-dir dgl==0.9.0 dglgo -f https://data.dgl.ai/wheels/repo.html; \ + pip install --no-cache-dir dgl dglgo -f https://data.dgl.ai/wheels/repo.html; \ fi # copy install scripts COPY docker/scripts/install_unifold.sh docker/scripts/install_colmap.sh docker/scripts/install_pytorch3d_nvdiffrast.sh docker/scripts/install_tiny_cuda_nn.sh docker/scripts/install_apex.sh /tmp/ -# for uniford -RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_unifold.sh; \ - else \ - echo 'cpu unsupport uniford'; \ - fi - -RUN if [ "$USE_GPU" = "True" ] ; then \ - export TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6+PTX" && pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \ - else \ - echo 'cpu unsupport Pointnet2'; \ - fi - # 3d supports RUN if [ "$USE_GPU" = "True" ] ; then \ bash /tmp/install_colmap.sh; \ else \ echo 'cpu unsupport colmap'; \ fi +# install pytorch3d RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_tiny_cuda_nn.sh \ + bash /tmp/install_pytorch3d_nvdiffrast.sh; \ else \ - echo 'cpu unsupport tiny_cudann'; \ + echo 'cpu unsupport pytorch3d nvdiffrast'; \ fi + +# for uniford RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_pytorch3d_nvdiffrast.sh; \ + bash /tmp/install_unifold.sh; \ else \ - echo 'cpu unsupport pytorch3d nvdiffrast'; \ + echo 'cpu unsupport uniford'; \ fi -# end of 3D -# install apex after deepspeed + RUN if [ "$USE_GPU" = "True" ] ; then \ - bash /tmp/install_apex.sh; \ + export TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.9;9.0;8.6+PTX" && pip install --no-cache-dir git+https://github.com/gxd1994/Pointnet2.PyTorch.git@master#subdirectory=pointnet2; \ else \ - echo 'cpu unsupport apex'; \ + echo 'cpu unsupport Pointnet2'; \ fi + +ARG TENSORFLOW_VERSION={tf_version} + RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir tensorflow==$TENSORFLOW_VERSION; \ + else \ + echo 'cpu not install tensorflow'; \ + fi + + RUN if [ "$USE_GPU" = "True" ] ; then \ + cd /tmp && git clone -b ms_build --single-branch https://github.com/tastelikefeet/mmcv.git && cd mmcv && TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.9;9.0;8.6+PTX" MMCV_WITH_OPS=1 MAX_JOBS=32 FORCE_CUDA=1 pip install . && cd / && rm -fr /tmp/mmcv && pip cache purge; \ + else \ + cd /tmp && git clone -b ms_build --single-branch https://github.com/tastelikefeet/mmcv.git && cd mmcv && MMCV_WITH_OPS=1 MAX_JOBS=32 pip install . 
&& cd / && rm -fr /tmp/mmcv && pip cache purge; \ + fi + + # This limits the cuda121 version + RUN if [ "$USE_GPU" = "True" ] ; then \ + pip install --no-cache-dir --force tinycudann==1.7 -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html; \ + else \ + echo 'cpu not install tinycudann'; \ + fi + + RUN pip install --no-cache-dir fairseq + ENTRYPOINT [] diff --git a/docker/build_image.py b/docker/build_image.py new file mode 100644 index 000000000..344fc9d37 --- /dev/null +++ b/docker/build_image.py @@ -0,0 +1,365 @@ +import argparse +import os +from datetime import datetime +from typing import Any + +docker_registry = os.environ['DOCKER_REGISTRY'] +assert docker_registry, 'You must pass a valid DOCKER_REGISTRY' +timestamp = datetime.now() +formatted_time = timestamp.strftime('%Y%m%d%H%M%S') + + +class Builder: + + def __init__(self, args: Any, dry_run: bool): + self.args = self.init_args(args) + self.dry_run = dry_run + self.args.cudatoolkit_version = self._generate_cudatoolkit_version( + args.cuda_version) + self.args.python_tag = self._generate_python_tag(args.python_version) + + def init_args(self, args: Any) -> Any: + if not args.base_image: + # A mirrored image of nvidia/cuda:12.4.0-devel-ubuntu22.04 + args.base_image = 'nvidia/cuda:12.1.0-devel-ubuntu22.04' + if not args.torch_version: + args.torch_version = '2.3.1' + args.torchaudio_version = '2.3.1' + args.torchvision_version = '0.18.1' + if not args.tf_version: + args.tf_version = '2.16.1' + if not args.cuda_version: + args.cuda_version = '12.1.0' + if not args.vllm_version: + args.vllm_version = '0.5.3' + if not args.lmdeploy_version: + args.lmdeploy_version = '0.6.2' + if not args.autogptq_version: + args.autogptq_version = '0.7.1' + return args + + def _generate_cudatoolkit_version(self, cuda_version: str) -> str: + cuda_version = cuda_version[:cuda_version.rfind('.')] + return 'cu' + cuda_version.replace('.', '') + + def _generate_python_tag(self, python_version: str) -> str: + python_version = python_version[:python_version.rfind('.')] + return 'py' + python_version.replace('.', '') + + def generate_dockerfile(self) -> str: + raise NotImplementedError + + def _save_dockerfile(self, content: str) -> None: + if os.path.exists('./Dockerfile'): + os.remove('./Dockerfile') + with open('./Dockerfile', 'w') as f: + f.write(content) + + def build(self) -> int: + pass + + def push(self) -> int: + pass + + def image(self) -> str: + pass + + def __call__(self): + content = self.generate_dockerfile() + self._save_dockerfile(content) + if not self.dry_run: + ret = self.build() + if ret != 0: + raise RuntimeError(f'Docker build error with errno: {ret}') + + ret = self.push() + if ret != 0: + raise RuntimeError(f'Docker push error with errno: {ret}') + + if self.args.ci_image != 0: + ret = os.system( + f'docker tag {self.image()} {docker_registry}:ci_image') + if ret != 0: + raise RuntimeError( + f'Docker tag ci_image error with errno: {ret}') + + +class BaseCPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + with open('docker/Dockerfile.ubuntu_base', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = content.replace('{use_gpu}', 'False') + content = content.replace('{python_version}', self.args.python_version) + content = content.replace('{torch_version}', self.args.torch_version) + content = content.replace('{cudatoolkit_version}', + self.args.cudatoolkit_version) + content = content.replace('{tf_version}', self.args.tf_version) + return content 
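+    # The Dockerfile template is filled in with plain str.replace on literal
+    # placeholders such as {base_image}, {use_gpu}, {python_version},
+    # {cudatoolkit_version} and {tf_version}; str.format is not used, so any
+    # other braces in the template are left untouched.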
+ + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-base') + + def build(self): + return os.system( + f'DOCKER_BUILDKIT=0 docker build -t {self.image()} -f Dockerfile .' + ) + + def push(self): + return os.system(f'docker push {self.image()}') + + +class BaseGPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + with open('docker/Dockerfile.ubuntu_base', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = content.replace('{use_gpu}', 'True') + content = content.replace('{python_version}', self.args.python_version) + content = content.replace('{torch_version}', self.args.torch_version) + content = content.replace('{cudatoolkit_version}', + self.args.cudatoolkit_version) + content = content.replace('{tf_version}', self.args.tf_version) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-tf{self.args.tf_version}-base') + + def build(self) -> int: + return os.system( + f'DOCKER_BUILDKIT=0 docker build -t {self.image()} -f Dockerfile .' + ) + + def push(self): + return os.system(f'docker push {self.image()}') + + +class CPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + meta_file = './docker/install_cpu.sh' + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} ' + f'{self.args.torchaudio_version}') + base_image = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}' + f'-torch{self.args.torch_version}-base') + extra_content = '' + + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', base_image) + content = content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'True') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace( + '{index_url}', + '--index-url https://download.pytorch.org/whl/cpu') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-{self.args.modelscope_version}-test' + ) + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-{self.args.modelscope_version}-{formatted_time}-test' + ) + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +class GPUImageBuilder(Builder): + + def generate_dockerfile(self) -> str: + meta_file = './docker/install.sh' + extra_content = """ +RUN pip install tf-keras==2.16.0 --no-dependencies && \ + pip install 
--no-cache-dir torchsde jupyterlab torchmetrics==0.11.4 basicsr pynvml shortuuid && \ + CUDA_HOME=/usr/local/cuda TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0 7.5 8.0 8.6 8.9 9.0" \ + pip install --no-cache-dir 'git+https://github.com/facebookresearch/detectron2.git' +""" + + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} {self.args.torchaudio_version} ' + f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}' + ) + base_image = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-{self.args.python_tag}-' + f'torch{self.args.torch_version}-tf{self.args.tf_version}-base') + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', base_image) + content = content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'True') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace('{index_url}', '') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-' + f'{self.args.modelscope_version}-test') + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-tf{self.args.tf_version}-' + f'{self.args.modelscope_version}-{formatted_time}-test') + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +class LLMImageBuilder(Builder): + + def init_args(self, args) -> Any: + if not args.base_image: + # A mirrored image of nvidia/cuda:12.4.0-devel-ubuntu22.04 + args.base_image = 'nvidia/cuda:12.4.0-devel-ubuntu22.04' + if not args.torch_version: + args.torch_version = '2.4.0' + args.torchaudio_version = '2.4.0' + args.torchvision_version = '0.19.0' + if not args.cuda_version: + args.cuda_version = '12.4.0' + if not args.vllm_version: + args.vllm_version = '0.6.3.post1' + if not args.lmdeploy_version: + args.lmdeploy_version = '0.6.2' + if not args.autogptq_version: + args.autogptq_version = '0.7.1' + return args + + def generate_dockerfile(self) -> str: + meta_file = './docker/install.sh' + with open('docker/Dockerfile.extra_install', 'r') as f: + extra_content = f.read() + extra_content = extra_content.replace('{python_version}', + self.args.python_version) + version_args = ( + f'{self.args.torch_version} {self.args.torchvision_version} {self.args.torchaudio_version} ' + f'{self.args.vllm_version} {self.args.lmdeploy_version} {self.args.autogptq_version}' + ) + with open('docker/Dockerfile.ubuntu', 'r') as f: + content = f.read() + content = content.replace('{base_image}', self.args.base_image) + content = 
content.replace('{extra_content}', extra_content) + content = content.replace('{meta_file}', meta_file) + content = content.replace('{version_args}', version_args) + content = content.replace('{install_ms_deps}', 'False') + content = content.replace('{torch_version}', + self.args.torch_version) + content = content.replace('{torchvision_version}', + self.args.torchvision_version) + content = content.replace('{torchaudio_version}', + self.args.torchaudio_version) + content = content.replace('{index_url}', '') + content = content.replace('{modelscope_branch}', + self.args.modelscope_branch) + content = content.replace('{swift_branch}', self.args.swift_branch) + return content + + def image(self) -> str: + return ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-{self.args.modelscope_version}-LLM-test' + ) + + def build(self) -> int: + return os.system(f'docker build -t {self.image()} -f Dockerfile .') + + def push(self): + ret = os.system(f'docker push {self.image()}') + if ret != 0: + return ret + image_tag2 = ( + f'{docker_registry}:ubuntu{self.args.ubuntu_version}-cuda{self.args.cuda_version}-' + f'{self.args.python_tag}-torch{self.args.torch_version}-' + f'{self.args.modelscope_version}-LLM-{formatted_time}-test') + ret = os.system(f'docker tag {self.image()} {image_tag2}') + if ret != 0: + return ret + return os.system(f'docker push {image_tag2}') + + +parser = argparse.ArgumentParser() +parser.add_argument('--base_image', type=str, default=None) +parser.add_argument('--image_type', type=str) +parser.add_argument('--python_version', type=str, default='3.10.14') +parser.add_argument('--ubuntu_version', type=str, default='22.04') +parser.add_argument('--torch_version', type=str, default=None) +parser.add_argument('--torchvision_version', type=str, default=None) +parser.add_argument('--cuda_version', type=str, default=None) +parser.add_argument('--ci_image', type=int, default=0) +parser.add_argument('--torchaudio_version', type=str, default=None) +parser.add_argument('--tf_version', type=str, default=None) +parser.add_argument('--vllm_version', type=str, default=None) +parser.add_argument('--lmdeploy_version', type=str, default=None) +parser.add_argument('--autogptq_version', type=str, default=None) +parser.add_argument('--modelscope_branch', type=str, default='master') +parser.add_argument('--modelscope_version', type=str, default='9.99.0') +parser.add_argument('--swift_branch', type=str, default='main') +parser.add_argument('--dry_run', type=int, default=0) + +args = parser.parse_args() + +if args.image_type.lower() == 'base_cpu': + builder_cls = BaseCPUImageBuilder +elif args.image_type.lower() == 'base_gpu': + builder_cls = BaseGPUImageBuilder +elif args.image_type.lower() == 'cpu': + builder_cls = CPUImageBuilder +elif args.image_type.lower() == 'gpu': + builder_cls = GPUImageBuilder +elif args.image_type.lower() == 'llm': + builder_cls = LLMImageBuilder +else: + raise ValueError(f'Unsupported image_type: {args.image_type}') + +builder_cls(args, args.dry_run)() diff --git a/docker/install.sh b/docker/install.sh new file mode 100644 index 000000000..3a6ffc13e --- /dev/null +++ b/docker/install.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +torch_version=${1:-2.4.0} +torchvision_version=${2:-0.19.0} +torchaudio_version=${3:-2.4.0} +vllm_version=${4:-0.6.0} +lmdeploy_version=${5:-0.6.1} +autogptq_version=${6:-0.7.1} + +pip install --no-cache-dir -U autoawq lmdeploy==$lmdeploy_version + +pip uninstall -y 
torch torchvision torchaudio + +pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version + +pip install --no-cache-dir tiktoken transformers_stream_generator bitsandbytes deepspeed torchmetrics decord optimum + +# pip install https://github.com/Dao-AILab/flash-attention/releases/download/v2.6.3/flash_attn-2.6.3+cu123torch2.4cxx11abiTRUE-cp310-cp310-linux_x86_64.whl +# find on: https://github.com/Dao-AILab/flash-attention/releases +cd /tmp && git clone https://github.com/Dao-AILab/flash-attention.git && cd flash-attention && python setup.py install && cd / && rm -fr /tmp/flash-attention && pip cache purge; + +pip install --no-cache-dir triton auto-gptq==$autogptq_version vllm==$vllm_version -U && pip cache purge + +# pip uninstall -y torch-scatter && TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6;8.9;9.0" pip install --no-cache-dir -U torch-scatter diff --git a/docker/install_cpu.sh b/docker/install_cpu.sh new file mode 100644 index 000000000..43831b92c --- /dev/null +++ b/docker/install_cpu.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +torch_version=${1:-2.4.0} +torchvision_version=${2:-0.19.0} +torchaudio_version=${3:-2.4.0} + +pip uninstall -y torch torchvision torchaudio + +pip install --no-cache-dir torch==$torch_version torchvision==$torchvision_version torchaudio==$torchaudio_version --index-url https://download.pytorch.org/whl/cpu diff --git a/docker/jupyter_plugins/jupyterlab_active_log/package.json b/docker/jupyter_plugins/jupyterlab_active_log/package.json deleted file mode 100644 index d2e0d0db1..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/package.json +++ /dev/null @@ -1,99 +0,0 @@ -{ - "name": "jupyterlab_active_log", - "version": "0.1.0", - "description": "A JupyterLab extension.", - "keywords": [ - "jupyter", - "jupyterlab", - "jupyterlab-extension" - ], - "homepage": "https://github.com/github_username/jupyterlab_active_log", - "bugs": { - "url": "https://github.com/github_username/jupyterlab_active_log/issues" - }, - "license": "BSD-3-Clause", - "files": [ - "lib/**/*.{d.ts,eot,gif,html,jpg,js,js.map,json,png,svg,woff2,ttf}", - "style/**/*.{css,js,eot,gif,html,jpg,json,png,svg,woff2,ttf}" - ], - "main": "lib/index.js", - "types": "lib/index.d.ts", - "style": "style/index.css", - "repository": { - "type": "git", - "url": "https://github.com/github_username/jupyterlab_active_log.git" - }, - "scripts": { - "build": "jlpm build:lib && jlpm build:labextension:dev", - "build:prod": "jlpm clean && jlpm build:lib && jlpm build:labextension", - "build:labextension": "jupyter labextension build .", - "build:labextension:dev": "jupyter labextension build --development True .", - "build:lib": "tsc", - "clean": "jlpm clean:lib", - "clean:lib": "rimraf lib tsconfig.tsbuildinfo", - "clean:lintcache": "rimraf .eslintcache .stylelintcache", - "clean:labextension": "rimraf jupyterlab_active_log/labextension", - "clean:all": "jlpm clean:lib && jlpm clean:labextension && jlpm clean:lintcache", - "eslint": "jlpm eslint:check --fix", - "eslint:check": "eslint . 
--cache --ext .ts,.tsx", - "install:extension": "jlpm build", - "lint": "jlpm stylelint && jlpm prettier && jlpm eslint", - "lint:check": "jlpm stylelint:check && jlpm prettier:check && jlpm eslint:check", - "prettier": "jlpm prettier:base --write --list-different", - "prettier:base": "prettier \"**/*{.ts,.tsx,.js,.jsx,.css,.json,.md}\"", - "prettier:check": "jlpm prettier:base --check", - "stylelint": "jlpm stylelint:check --fix", - "stylelint:check": "stylelint --cache \"style/**/*.css\"", - "watch": "run-p watch:src watch:labextension", - "watch:src": "tsc -w", - "watch:labextension": "jupyter labextension watch ." - }, - "dependencies": { - "@jupyterlab/application": "^3.1.0" - }, - "devDependencies": { - "@jupyterlab/builder": "^3.1.0", - "@typescript-eslint/eslint-plugin": "^4.8.1", - "@typescript-eslint/parser": "^4.8.1", - "eslint": "^7.14.0", - "eslint-config-prettier": "^6.15.0", - "eslint-plugin-prettier": "^3.1.4", - "npm-run-all": "^4.1.5", - "prettier": "^2.1.1", - "rimraf": "^3.0.2", - "stylelint": "^14.3.0", - "stylelint-config-prettier": "^9.0.3", - "stylelint-config-recommended": "^6.0.0", - "stylelint-config-standard": "~24.0.0", - "stylelint-prettier": "^2.0.0", - "typescript": "~4.1.3" - }, - "sideEffects": [ - "style/*.css", - "style/index.js" - ], - "styleModule": "style/index.js", - "publishConfig": { - "access": "public" - }, - "jupyterlab": { - "extension": true, - "outputDir": "jupyterlab_active_log/labextension", - "_build": { - "load": "static/remoteEntry.eb3177c3791d7658cc12.js", - "extension": "./extension", - "style": "./style" - } - }, - "jupyter-releaser": { - "hooks": { - "before-build-npm": [ - "python -m pip install jupyterlab~=3.1", - "jlpm" - ], - "before-build-python": [ - "jlpm clean:all" - ] - } - } -} diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js b/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js deleted file mode 100644 index b70adee6b..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/568.a92ae44b87625ab09aed.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[568],{568:(t,e,a)=>{a.r(e),a.d(e,{default:()=>i});const i={id:"jupyterlab_active_log:plugin",autoStart:!0,activate:t=>{console.log("JupyterLab extension jupyterlab_active_log is activated!"),window.consts=Object.assign(Object.assign({},window.consts),{recordUrl:"https://modelscope.cn/api/v1/notebooks/activelog",timerDuration:1e4,timerParams:function(){const t=location.pathname.split("/");let e;return t.length>=2&&(e=t[1]),{site:"dsw",id:e,ext:{pathname:location.pathname}}}});const e=document.body,a=e.insertBefore(document.createElement("script"),e.firstChild);a.setAttribute("id","timer-sdk"),a.setAttribute("src","https://g.alicdn.com/alifanyi/translate-js-sdk/timer.js ")}}}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js b/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js deleted file mode 100644 index 2129fc3d0..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/747.63b4c3d22bfe458b352b.js +++ /dev/null @@ -1 +0,0 @@ -"use strict";(self.webpackChunkjupyterlab_active_log=self.webpackChunkjupyterlab_active_log||[]).push([[747],{150:(e,n,t)=>{t.d(n,{Z:()=>a});var r=t(645),o=t.n(r)()((function(e){return e[1]}));o.push([e.id,"/*\n See the JupyterLab Developer Guide for useful CSS Patterns:\n\n 
https://jupyterlab.readthedocs.io/en/stable/developer/css.html\n*/\n",""]);const a=o},645:e=>{e.exports=function(e){var n=[];return n.toString=function(){return this.map((function(n){var t=e(n);return n[2]?"@media ".concat(n[2]," {").concat(t,"}"):t})).join("")},n.i=function(e,t,r){"string"==typeof e&&(e=[[null,e,""]]);var o={};if(r)for(var a=0;a{var r,o=function(){var e={};return function(n){if(void 0===e[n]){var t=document.querySelector(n);if(window.HTMLIFrameElement&&t instanceof window.HTMLIFrameElement)try{t=t.contentDocument.head}catch(e){t=null}e[n]=t}return e[n]}}(),a=[];function i(e){for(var n=-1,t=0;t{t.r(n);var r=t(379),o=t.n(r),a=t(150);o()(a.Z,{insert:"head",singleton:!1}),a.Z.locals}}]); diff --git a/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js b/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js deleted file mode 100644 index ec49e9734..000000000 --- a/docker/jupyter_plugins/jupyterlab_active_log/static/remoteEntry.eb3177c3791d7658cc12.js +++ /dev/null @@ -1 +0,0 @@ -var _JUPYTERLAB;(()=>{"use strict";var e,r,t={293:(e,r,t)=>{var o={"./index":()=>t.e(568).then((()=>()=>t(568))),"./extension":()=>t.e(568).then((()=>()=>t(568))),"./style":()=>t.e(747).then((()=>()=>t(747)))},a=(e,r)=>(t.R=r,r=t.o(o,e)?o[e]():Promise.resolve().then((()=>{throw new Error('Module "'+e+'" does not exist in container.')})),t.R=void 0,r),n=(e,r)=>{if(t.S){var o="default",a=t.S[o];if(a&&a!==e)throw new Error("Container initialization failed as it has already been initialized with a different share scope");return t.S[o]=e,t.I(o,r)}};t.d(r,{get:()=>a,init:()=>n})}},o={};function a(e){var r=o[e];if(void 0!==r)return r.exports;var n=o[e]={id:e,exports:{}};return t[e](n,n.exports,a),n.exports}a.m=t,a.c=o,a.n=e=>{var r=e&&e.__esModule?()=>e.default:()=>e;return a.d(r,{a:r}),r},a.d=(e,r)=>{for(var t in r)a.o(r,t)&&!a.o(e,t)&&Object.defineProperty(e,t,{enumerable:!0,get:r[t]})},a.f={},a.e=e=>Promise.all(Object.keys(a.f).reduce(((r,t)=>(a.f[t](e,r),r)),[])),a.u=e=>e+"."+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e]+".js?v="+{568:"a92ae44b87625ab09aed",747:"63b4c3d22bfe458b352b"}[e],a.g=function(){if("object"==typeof globalThis)return globalThis;try{return this||new Function("return this")()}catch(e){if("object"==typeof window)return window}}(),a.o=(e,r)=>Object.prototype.hasOwnProperty.call(e,r),e={},r="jupyterlab_active_log:",a.l=(t,o,n,i)=>{if(e[t])e[t].push(o);else{var l,u;if(void 0!==n)for(var c=document.getElementsByTagName("script"),d=0;d{l.onerror=l.onload=null,clearTimeout(f);var a=e[t];if(delete e[t],l.parentNode&&l.parentNode.removeChild(l),a&&a.forEach((e=>e(o))),r)return r(o)},f=setTimeout(p.bind(null,void 0,{type:"timeout",target:l}),12e4);l.onerror=p.bind(null,l.onerror),l.onload=p.bind(null,l.onload),u&&document.head.appendChild(l)}},a.r=e=>{"undefined"!=typeof Symbol&&Symbol.toStringTag&&Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}),Object.defineProperty(e,"__esModule",{value:!0})},(()=>{a.S={};var e={},r={};a.I=(t,o)=>{o||(o=[]);var n=r[t];if(n||(n=r[t]={}),!(o.indexOf(n)>=0)){if(o.push(n),e[t])return e[t];a.o(a.S,t)||(a.S[t]={});var i=a.S[t],l="jupyterlab_active_log",u=[];return"default"===t&&((e,r,t,o)=>{var n=i[e]=i[e]||{},u=n[r];(!u||!u.loaded&&(1!=!u.eager?o:l>u.from))&&(n[r]={get:()=>a.e(568).then((()=>()=>a(568))),from:l,eager:!1})})("jupyterlab_active_log","0.1.0"),e[t]=u.length?Promise.all(u).then((()=>e[t]=1)):1}}})(),(()=>{var e;a.g.importScripts&&(e=a.g.location+"");var 
r=a.g.document;if(!e&&r&&(r.currentScript&&(e=r.currentScript.src),!e)){var t=r.getElementsByTagName("script");t.length&&(e=t[t.length-1].src)}if(!e)throw new Error("Automatic publicPath is not supported in this browser");e=e.replace(/#.*$/,"").replace(/\?.*$/,"").replace(/\/[^\/]+$/,"/"),a.p=e})(),(()=>{var e={346:0};a.f.j=(r,t)=>{var o=a.o(e,r)?e[r]:void 0;if(0!==o)if(o)t.push(o[2]);else{var n=new Promise(((t,a)=>o=e[r]=[t,a]));t.push(o[2]=n);var i=a.p+a.u(r),l=new Error;a.l(i,(t=>{if(a.o(e,r)&&(0!==(o=e[r])&&(e[r]=void 0),o)){var n=t&&("load"===t.type?"missing":t.type),i=t&&t.target&&t.target.src;l.message="Loading chunk "+r+" failed.\n("+n+": "+i+")",l.name="ChunkLoadError",l.type=n,l.request=i,o[1](l)}}),"chunk-"+r,r)}};var r=(r,t)=>{var o,n,[i,l,u]=t,c=0;if(i.some((r=>0!==e[r]))){for(o in l)a.o(l,o)&&(a.m[o]=l[o]);u&&u(a)}for(r&&r(t);c 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a good time -# -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ - -# ARG BASE_IMAGE=reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 -# FROM ${BASE_IMAGE} as dev-base - -# FROM reg.docker.alibaba-inc.com/pai-dlc/pytorch-training:1.10PAI-gpu-py36-cu113-ubuntu18.04 as dev-base -FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-devel -# FROM pytorch/pytorch:1.10.0-cuda11.3-cudnn8-runtime -# config pip source -RUN mkdir /root/.pip -COPY docker/rcfiles/pip.conf.tsinghua /root/.pip/pip.conf -COPY docker/rcfiles/sources.list.aliyun /etc/apt/sources.list - -# Install essential Ubuntu packages -RUN apt-get update &&\ - apt-get install -y software-properties-common \ - build-essential \ - git \ - wget \ - vim \ - curl \ - zip \ - zlib1g-dev \ - unzip \ - pkg-config \ - libsndfile1 - -# install modelscope and its python env -WORKDIR /opt/modelscope -COPY . . 
-RUN pip install -r requirements.txt -# RUN --mount=type=cache,target=/opt/ccache \ -# python setup.py install - -# opencv-python-headless conflict with opencv-python installed -RUN python setup.py install \ - && pip uninstall -y opencv-python-headless - -# prepare modelscope libs -COPY docker/scripts/install_libs.sh /tmp/ -RUN bash /tmp/install_libs.sh && \ - rm -rf /tmp/install_libs.sh - -ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/modelscope/lib64 - -WORKDIR /workspace diff --git a/docker/rcfiles/conda.tuna b/docker/rcfiles/conda.tuna deleted file mode 100644 index ce8a29085..000000000 --- a/docker/rcfiles/conda.tuna +++ /dev/null @@ -1,15 +0,0 @@ -channels: - - defaults -show_channel_urls: true -default_channels: - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/main - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/r - - https://mirrors.tuna.tsinghua.edu.cn/anaconda/pkgs/msys2 -custom_channels: - conda-forge: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - msys2: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - bioconda: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - menpo: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - pytorch: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - pytorch-lts: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud - simpleitk: https://mirrors.tuna.tsinghua.edu.cn/anaconda/cloud diff --git a/docker/rcfiles/pip.conf.tsinghua b/docker/rcfiles/pip.conf.tsinghua deleted file mode 100644 index 4242075a4..000000000 --- a/docker/rcfiles/pip.conf.tsinghua +++ /dev/null @@ -1,2 +0,0 @@ -[global] -index-url=https://pypi.tuna.tsinghua.edu.cn/simple diff --git a/docker/rcfiles/sources.list.aliyun b/docker/rcfiles/sources.list.aliyun deleted file mode 100644 index 1ebf4ae53..000000000 --- a/docker/rcfiles/sources.list.aliyun +++ /dev/null @@ -1,14 +0,0 @@ -deb https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-security main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-updates main restricted universe multiverse - -# deb https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-proposed main restricted universe multiverse - -deb https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse -# deb-src https://mirrors.aliyun.com/ubuntu/ focal-backports main restricted universe multiverse diff --git a/docker/rcfiles/ubuntu20.04_sources.tuna b/docker/rcfiles/ubuntu20.04_sources.tuna deleted file mode 100644 index a247bbfa6..000000000 --- a/docker/rcfiles/ubuntu20.04_sources.tuna +++ /dev/null @@ -1,13 +0,0 @@ -# 默认注释了源码镜像以提高 apt update 速度,如有需要可自行取消注释 -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-updates main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse -# deb-src 
https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-backports main restricted universe multiverse -deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-security main restricted universe multiverse - -# 预发布软件源,不建议启用 -# deb https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse -# deb-src https://mirrors.tuna.tsinghua.edu.cn/ubuntu/ focal-proposed main restricted universe multiverse diff --git a/docker/rcfiles/ubuntu2204.aliyun b/docker/rcfiles/ubuntu2204.aliyun new file mode 100644 index 000000000..d5dce70cf --- /dev/null +++ b/docker/rcfiles/ubuntu2204.aliyun @@ -0,0 +1,10 @@ +deb http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-security main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-updates main restricted universe multiverse +#deb http://mirrors.aliyun.com/ubuntu/ jammy-proposed main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-proposed main restricted universe multiverse +deb http://mirrors.aliyun.com/ubuntu/ jammy-backports main restricted universe multiverse +#deb-src http://mirrors.aliyun.com/ubuntu/ jammy-backports main restricted universe multiverse diff --git a/docker/rcfiles/user.vimrc b/docker/rcfiles/user.vimrc deleted file mode 100644 index 590aca43f..000000000 --- a/docker/rcfiles/user.vimrc +++ /dev/null @@ -1,10 +0,0 @@ -set nocompatible -set encoding=utf-8 -set hlsearch -set smartindent -set ruler -set number -set ts=2 -set sw=2 -set expandtab -autocmd FileType make setlocal noexpandtab diff --git a/docker/scripts/install_apex.sh b/docker/scripts/install_apex.sh index 40d9f268f..7ecd288b4 100644 --- a/docker/scripts/install_apex.sh +++ b/docker/scripts/install_apex.sh @@ -2,6 +2,6 @@ export MAX_JOBS=16 \ && git clone https://github.com/NVIDIA/apex \ && cd apex \ && git checkout 6bd01c4b99a84648ad5e5238a959735e6936c813 \ -&& TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.6" pip install -v --disable-pip-version-check --no-cache --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ +&& TORCH_CUDA_ARCH_LIST="6.0;6.1;6.2;7.0;7.5;8.0;8.9;9.0;8.6+PTX" pip install -v --disable-pip-version-check --no-cache --global-option="--cpp_ext" --global-option="--cuda_ext" ./ \ && cd .. 
\ && rm -rf apex diff --git a/docker/scripts/install_colmap.sh b/docker/scripts/install_colmap.sh index f21fca1d8..ada7077ab 100644 --- a/docker/scripts/install_colmap.sh +++ b/docker/scripts/install_colmap.sh @@ -8,7 +8,7 @@ wget -q https://cmake.org/files/v3.25/cmake-3.25.2-linux-x86_64.sh \ && export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && export MAX_JOBS=16 \ && export CUDA_ARCHITECTURES="all" \ - && git clone --depth 1 --branch 3.8 https://github.com/colmap/colmap.git \ + && git clone https://github.com/colmap/colmap.git \ && cd colmap \ && mkdir build \ && cd build \ diff --git a/docker/scripts/install_flash_attension.sh b/docker/scripts/install_flash_attension.sh deleted file mode 100644 index f37e567d9..000000000 --- a/docker/scripts/install_flash_attension.sh +++ /dev/null @@ -1,4 +0,0 @@ - git clone -b v2.3.2 https://github.com/Dao-AILab/flash-attention && \ - cd flash-attention && python setup.py install && \ - cd .. && \ - rm -rf flash-attention diff --git a/docker/scripts/install_pytorch3d_nvdiffrast.sh b/docker/scripts/install_pytorch3d_nvdiffrast.sh index c7880f92d..c64ea7fb5 100644 --- a/docker/scripts/install_pytorch3d_nvdiffrast.sh +++ b/docker/scripts/install_pytorch3d_nvdiffrast.sh @@ -1,6 +1,7 @@ export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && export MAX_JOBS=36 \ - && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ + && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;8.6+PTX;87;89;90" \ + && export TORCH_CUDA_ARCH_LIST="5.0;5.2;6.0;6.1;7.0;7.5;8.0;8.6+PTX;8.7;8.9;9.0" \ && git clone --branch 2.1.0 --recursive https://github.com/NVIDIA/thrust.git \ && cd thrust \ && mkdir build \ @@ -10,7 +11,11 @@ export CMAKE_BUILD_PARALLEL_LEVEL=36 \ && cd ../.. \ && rm -rf thrust \ && pip install --no-cache-dir fvcore iopath \ - && pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ + && curl -LO https://github.com/NVIDIA/cub/archive/2.1.0.tar.gz \ + && tar xzf 2.1.0.tar.gz \ + && export CUB_HOME=$PWD/cub-2.1.0 \ + && FORCE_CUDA=1 pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ + && rm -fr 2.1.0.tar.gz $PWD/cub-2.1.0 \ && apt-get update \ && apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1 libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev -y \ && git clone https://github.com/NVlabs/nvdiffrast.git \ diff --git a/docker/scripts/install_tiny_cuda_nn.sh b/docker/scripts/install_tiny_cuda_nn.sh index 96ae5c722..1aaa2863f 100644 --- a/docker/scripts/install_tiny_cuda_nn.sh +++ b/docker/scripts/install_tiny_cuda_nn.sh @@ -1,7 +1,6 @@ -export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export TCNN_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ +export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=36 && export TCNN_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;89;90;86" \ && git clone --recursive https://github.com/nvlabs/tiny-cuda-nn \ && cd tiny-cuda-nn \ - && git checkout v1.6 \ && cd bindings/torch \ && python setup.py install \ && cd ../../.. 
\ diff --git a/docker/scripts/torch111_torch3d_nvdiffrast.sh b/docker/scripts/torch111_torch3d_nvdiffrast.sh deleted file mode 100644 index ca86b0ccf..000000000 --- a/docker/scripts/torch111_torch3d_nvdiffrast.sh +++ /dev/null @@ -1,14 +0,0 @@ -export CMAKE_BUILD_PARALLEL_LEVEL=36 && export MAX_JOBS=4 && export CMAKE_CUDA_ARCHITECTURES="50;52;60;61;70;75;80;86" \ - && pip install --no-cache-dir fvcore iopath \ - && curl -LO https://github.com/NVIDIA/cub/archive/1.10.0.tar.gz \ - && tar xzf 1.10.0.tar.gz \ - && export CUB_HOME=$PWD/cub-1.10.0 \ - && pip install "git+https://github.com/facebookresearch/pytorch3d.git@stable" \ - && rm -fr 1.10.0.tar.gz cub-1.10.0 \ - && apt-get update \ - && apt-get install -y --no-install-recommends pkg-config libglvnd0 libgl1 libglx0 libegl1 libgles2 libglvnd-dev libgl1-mesa-dev libegl1-mesa-dev libgles2-mesa-dev -y \ - && git clone https://github.com/NVlabs/nvdiffrast.git \ - && cd nvdiffrast \ - && pip install --no-cache-dir . \ - && cd .. \ - && rm -rf nvdiffrast diff --git a/docs/Makefile b/docs/Makefile index d0c3cbf10..a9f9d1a9b 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,4 +17,4 @@ help: # Catch-all target: route all unknown targets to Sphinx using the new # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). %: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + @$(SPHINXBUILD) -b json "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/docs/source/api/modelscope.models.cv.rst b/docs/source/api/modelscope.models.cv.rst index ac52fef12..464e4535f 100644 --- a/docs/source/api/modelscope.models.cv.rst +++ b/docs/source/api/modelscope.models.cv.rst @@ -11,7 +11,6 @@ modelscope.models.cv :nosignatures: :template: classtemplate.rst - easycv_base.EasyCVBaseModel image_colorization.ddcolor.ddcolor_for_image_colorization.DDColorForImageColorization image_deblur.nafnet_for_image_deblur.NAFNetForImageDeblur image_defrcn_fewshot.defrcn_for_fewshot.DeFRCNForFewShot @@ -19,7 +18,6 @@ modelscope.models.cv image_face_fusion.image_face_fusion.ImageFaceFusion image_matching.quadtree_attention_model.QuadTreeAttentionForImageMatching image_skychange.skychange_model.ImageSkychange - language_guided_video_summarization.summarizer.ClipItVideoSummarization panorama_depth_estimation.unifuse_model.PanoramaDepthEstimation video_stabilization.DUTRAFTStabilizer.DUTRAFTStabilizer video_summarization.summarizer.PGLVideoSummarization diff --git a/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst index b5a4b0f6d..32cb97b89 100644 --- a/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst +++ b/docs/source/api/modelscope.msdatasets.dataset_cls.custom_datasets.rst @@ -13,17 +13,14 @@ modelscope.msdatasets.dataset_cls.custom_datasets EasyCVBaseDataset TorchCustomDataset - MovieSceneSegmentationDataset ImageInstanceSegmentationCocoDataset GoproImageDeblurringDataset - LanguageGuidedVideoSummarizationDataset MGeoRankingDataset RedsImageDeblurringDataset TextRankingDataset VecoDataset VideoSummarizationDataset BadImageDetectingDataset - ImageInpaintingDataset ImagePortraitEnhancementDataset ImageQualityAssessmentDegradationDataset ImageQualityAssessmentMosDataset diff --git a/docs/source/api/modelscope.pipelines.audio.rst b/docs/source/api/modelscope.pipelines.audio.rst index 71d7d13b6..4357a84bf 100644 --- a/docs/source/api/modelscope.pipelines.audio.rst +++ b/docs/source/api/modelscope.pipelines.audio.rst @@ -12,7 
+12,6 @@ modelscope.pipelines.audio :template: classtemplate.rst ANSPipeline - AutomaticSpeechRecognitionPipeline InverseTextProcessingPipeline KWSFarfieldPipeline KeyWordSpottingKwsbpPipeline diff --git a/docs/source/api/modelscope.pipelines.cv.rst b/docs/source/api/modelscope.pipelines.cv.rst index b2190ef1a..68341e6d1 100644 --- a/docs/source/api/modelscope.pipelines.cv.rst +++ b/docs/source/api/modelscope.pipelines.cv.rst @@ -63,7 +63,6 @@ modelscope.pipelines.cv ImageSkychangePipeline ImageStyleTransferPipeline ImageSuperResolutionPipeline - LanguageGuidedVideoSummarizationPipeline LicensePlateDetectionPipeline LiveCategoryPipeline MaskDINOInstanceSegmentationPipeline diff --git a/docs/source/api/modelscope.pipelines.nlp.rst b/docs/source/api/modelscope.pipelines.nlp.rst index ef783db17..4ad9f9e37 100644 --- a/docs/source/api/modelscope.pipelines.nlp.rst +++ b/docs/source/api/modelscope.pipelines.nlp.rst @@ -11,7 +11,6 @@ modelscope.pipelines.nlp :nosignatures: :template: classtemplate.rst - AutomaticPostEditingPipeline CodeGeeXCodeGenerationPipeline CodeGeeXCodeTranslationPipeline ConversationalTextToSqlPipeline diff --git a/docs/source/api/modelscope.pipelines.science.rst b/docs/source/api/modelscope.pipelines.science.rst index eabb12b6d..e934a7cd8 100644 --- a/docs/source/api/modelscope.pipelines.science.rst +++ b/docs/source/api/modelscope.pipelines.science.rst @@ -10,5 +10,3 @@ modelscope.pipelines.science :toctree: generated :nosignatures: :template: classtemplate.rst - - ProteinStructurePipeline diff --git a/docs/source/api/modelscope.trainers.hooks.rst b/docs/source/api/modelscope.trainers.hooks.rst index 5fd903383..5fe5e3610 100644 --- a/docs/source/api/modelscope.trainers.hooks.rst +++ b/docs/source/api/modelscope.trainers.hooks.rst @@ -14,8 +14,6 @@ modelscope.trainers.hooks builder.build_hook hook.Hook priority.Priority - checkpoint_hook.CheckpointHook - checkpoint_hook.BestCkptSaverHook compression.SparsityHook evaluation_hook.EvaluationHook iter_timer_hook.IterTimerHook diff --git a/docs/source/change_log.md b/docs/source/change_log.md index 1081c148c..e8f286ac0 100644 --- a/docs/source/change_log.md +++ b/docs/source/change_log.md @@ -16,7 +16,7 @@ Second internal release. * add palm2.0 * add space model * add MPLUG model -* add dialog_intent, dialog_modeling, dialog state tracking pipleline +* add dialog_intent, dialog_modeling, dialog state tracking pipeline * add maskedlm model and fill_mask pipeline * add nli pipeline * add sentence similarity pipeline @@ -28,7 +28,7 @@ Second internal release. #### Audio * add tts pipeline -* add kws kwsbp pipline +* add kws kwsbp pipeline * add linear aec pipeline * add ans pipeline diff --git a/docs/source/command.md b/docs/source/command.md new file mode 100644 index 000000000..2d5c73fbc --- /dev/null +++ b/docs/source/command.md @@ -0,0 +1,157 @@ +# ModelScope command line usage +## Supported commands +```bash +modelscope --help +usage: modelscope [] + +positional arguments: + {download,plugin,pipeline,modelcard,model,server,login} + modelscope commands helpers + +options: + -h, --help show this help message and exit + +``` +## login +```bash +modelscope login --help +usage: modelscope [] login [-h] --token TOKEN + +options: + -h, --help show this help message and exit + --token TOKEN The Access Token for modelscope. 
+``` +Get access token: [我的页面](https://modelscope.cn/my/myaccesstoken)获取**SDK 令牌** + + +## download model +```bash +modelscope download --help + + usage: modelscope [] download [-h] --model MODEL [--revision REVISION] [--cache_dir CACHE_DIR] [--local_dir LOCAL_DIR] [--include [INCLUDE ...]] [--exclude [EXCLUDE ...]] [files ...] + + positional arguments: + files Specify relative path to the repository file(s) to download.(e.g 'tokenizer.json', 'onnx/decoder_model.onnx'). + + options: + -h, --help show this help message and exit + --model MODEL The model id to be downloaded. + --revision REVISION Revision of the model. + --cache_dir CACHE_DIR + Cache directory to save model. + --local_dir LOCAL_DIR + File will be downloaded to local location specified bylocal_dir, in this case, cache_dir parameter will be ignored. + --include [INCLUDE ...] + Glob patterns to match files to download.Ignored if file is specified + --exclude [EXCLUDE ...] + Glob patterns to exclude from files to download.Ignored if file is specified +``` +## Usage Examples + +Command Examples([gpt2](https://www.modelscope.cn/models/AI-ModelScope/gpt2/files)) + +### Specify downloading of a single file +```bash + modelscope download --model 'AI-ModelScope/gpt2' 64.tflite +``` + +### Specify multiple files to download +```bash + modelscope download --model 'AI-ModelScope/gpt2' 64.tflite config.json +``` +### Specify certain files to download  +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include 'onnx/*' '*.tflite' +``` +### Filter specified files +```bash + modelscope download --model 'AI-ModelScope/gpt2' --exclude 'onnx/*' '*.tflite'  +``` +### Specify the download cache directory +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include '*.json' --cache_dir './cache_dir' +``` +   The model files will be downloaded to cache\_dir/AI-ModelScope/gpt2/ + +### Specify the local directory for downloading     +```bash + modelscope download --model 'AI-ModelScope/gpt2' --include '*.json' --cache_dir './local_dir' +``` +  The model files will be downloaded to ./local\_dir + +If both the local directory and the cache directory are specified, the local directory will take precedence. + +## model operation +Supports creating models and uploading model files. 
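+
+For example, a typical create-then-upload flow might look like the following (the token, group id, model id and paths are placeholders; the full option list is shown below):
+
+```bash
+modelscope model -act create -tk 'YOUR_SDK_TOKEN' -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -vis 1 -lic 'MIT'
+modelscope model -act upload -tk 'YOUR_SDK_TOKEN' -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -md modelfiles/ -vt 'v0.0.1' -vi 'upload model files'
+```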
+```bash +modelscope model --help +usage: modelscope [] modelcard [-h] [-tk ACCESS_TOKEN] -act {create,upload,download} [-gid GROUP_ID] -mid MODEL_ID [-vis VISIBILITY] [-lic LICENSE] [-ch CHINESE_NAME] [-md MODEL_DIR] [-vt VERSION_TAG] [-vi VERSION_INFO] + +options: + -h, --help show this help message and exit + -tk ACCESS_TOKEN, --access_token ACCESS_TOKEN + the certification of visit ModelScope + -act {create,upload,download}, --action {create,upload,download} + the action of api ModelScope[create, upload] + -gid GROUP_ID, --group_id GROUP_ID + the group name of ModelScope, eg, damo + -mid MODEL_ID, --model_id MODEL_ID + the model name of ModelScope + -vis VISIBILITY, --visibility VISIBILITY + the visibility of ModelScope[PRIVATE: 1, INTERNAL:3, PUBLIC:5] + -lic LICENSE, --license LICENSE + the license of visit ModelScope[Apache License 2.0|GPL-2.0|GPL-3.0|LGPL-2.1|LGPL-3.0|AFL-3.0|ECL-2.0|MIT] + -ch CHINESE_NAME, --chinese_name CHINESE_NAME + the chinese name of ModelScope + -md MODEL_DIR, --model_dir MODEL_DIR + the model_dir of configuration.json + -vt VERSION_TAG, --version_tag VERSION_TAG + the tag of uploaded model + -vi VERSION_INFO, --version_info VERSION_INFO + the info of uploaded model +``` + +### Create model +```bash + modelscope model -act create -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -vis 1 -lic 'MIT' -ch '中文名字' +``` +Will create model THE_MODEL_ID in www.modelscope.cn + +### Upload model files +```bash + modelscope model -act upload -gid 'YOUR_GROUP_ID' -mid 'THE_MODEL_ID' -md modelfiles/ -vt 'v0.0.1' -vi 'upload model files' +``` + +## Pipeline +Create the template files needed for pipeline. + +```bash +modelscope pipeline --help +usage: modelscope [] pipeline [-h] -act {create} [-tpl TPL_FILE_PATH] [-s SAVE_FILE_PATH] [-f FILENAME] -t TASK_NAME [-m MODEL_NAME] [-p PREPROCESSOR_NAME] [-pp PIPELINE_NAME] [-config CONFIGURATION_PATH] + +options: + -h, --help show this help message and exit + -act {create}, --action {create} + the action of command pipeline[create] + -tpl TPL_FILE_PATH, --tpl_file_path TPL_FILE_PATH + the template be selected for ModelScope[template.tpl] + -s SAVE_FILE_PATH, --save_file_path SAVE_FILE_PATH + the name of custom template be saved for ModelScope + -f FILENAME, --filename FILENAME + the init name of custom template be saved for ModelScope + -t TASK_NAME, --task_name TASK_NAME + the unique task_name for ModelScope + -m MODEL_NAME, --model_name MODEL_NAME + the class of model name for ModelScope + -p PREPROCESSOR_NAME, --preprocessor_name PREPROCESSOR_NAME + the class of preprocessor name for ModelScope + -pp PIPELINE_NAME, --pipeline_name PIPELINE_NAME + the class of pipeline name for ModelScope + -config CONFIGURATION_PATH, --configuration_path CONFIGURATION_PATH + the path of configuration.json for ModelScope +``` + +### Create pipeline files +```bash + modelscope pipeline -act 'create' -t 'THE_PIPELINE_TASK' -m 'THE_MODEL_NAME' -pp 'THE_PIPELINE_NAME' +``` diff --git a/docs/source/develop.md b/docs/source/develop.md index af8ea5e75..c2fde1e63 100644 --- a/docs/source/develop.md +++ b/docs/source/develop.md @@ -119,7 +119,7 @@ git lfs install 2. We use a public read model repository from ModelScope to store test data. The repository has been added by default as a submodule with the path data/test. To clone it, use the following command: ```shell -git clone git@github.com:modelscope/modelscope.git --recursive +git clone https://github.com/modelscope/modelscope.git --recursive ``` 3. 
Each time you add new data, go to the data/test directory (note that you are now in the submodule's git directory), check if you are on the master branch, and pull the latest master branch: diff --git a/docs/source/develop_cn.md b/docs/source/develop_cn.md index e342b43a5..224df8f47 100644 --- a/docs/source/develop_cn.md +++ b/docs/source/develop_cn.md @@ -90,8 +90,7 @@ git lfs install 2. 我们使用 ModelScope 的一个公共读取模型仓库来存储测试数据。该仓库已默认添加为子模块,路径为 data/test。要克隆它,请使用以下命令: ``` - -git clone git@github.com:modelscope/modelscope.git --recursive +git clone https://github.com/modelscope/modelscope.git --recursive ``` 3. 每次添加新数据时,进入 data/test 目录(注意此时您已在子模块的 git 目录中),检查是否在 master 分支上,并拉取最新的 master 分支: diff --git a/docs/source/server.md b/docs/source/server.md new file mode 100644 index 000000000..150f56860 --- /dev/null +++ b/docs/source/server.md @@ -0,0 +1,41 @@ +# modelscope server使用 +## 1. 通用服务 +modelscope库基于fastapi开发一个简单模型服务,可以通过一条命令拉起绝大多数模型 +使用方法: + +```bash +modelscope server --model_id=modelscope/Llama-2-7b-chat-ms --revision=v1.0.5 +``` +我们提供的官方镜像中也可以一个命令启动(镜像还未完成) +```bash +docker run --rm --name maas_dev --shm-size=50gb --gpus='"device=0"' -e MODELSCOPE_CACHE=/modelscope_cache -v /host_path_to_modelscope_cache:/modelscope_cache -p 8000:8000 reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu22.04-cuda11.8.0-py310-torch2.1.0-tf2.14.0-1.9.5-server modelscope server --model_id=modelscope/Llama-2-7b-chat-ms --revision=v1.0.5 +``` +服务默认监听8000端口,您也可以通过--port改变端口,默认服务提供两个接口,接口文档您可以通过 +http://ip:port/docs查看 +通过describe接口,可以获取服务输入输出信息以及输入sample数据,如下图: +![describe](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/describe.jpg) +服务调用接口,可以直接拷贝describe接口example示例数据,如下图: +![call](https://modelscope.oss-cn-beijing.aliyuncs.com/resource/call.jpg) + +## 2. vllm大模型推理 +对于LLM我们提供了vllm推理支持,目前只有部分模型支持vllm。 + +### 2.1 vllm直接支持modelscope模型 +可以通过设置环境变量使得vllm从www.modelscope.cn下载模型。 + +启动普通server +```bash +VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.api_server --model="damo/nlp_gpt2_text-generation_english-base" --revision="v1.0.0" +``` +启动openai兼容接口 +```bash +VLLM_USE_MODELSCOPE=True python -m vllm.entrypoints.openai.api_server --model="damo/nlp_gpt2_text-generation_english-base" --revision="v1.0.0" +``` + +如果模型在modelscope cache目录已经存在,则会直接使用cache中的模型,否则会从www.modelscope.cn下载模型。 + +通过modelscope官方镜像启动vllm,指定端口为9090 + +```bash +docker run --rm --name maas_dev --shm-size=50gb --gpus='"device=0"' -e MODELSCOPE_CACHE=/modelscope_cache -v /host_path_to_modelscope_cache:/modelscope_cache -p 9090:9090 reg.docker.alibaba-inc.com/modelscope/modelscope:ubuntu22.04-cuda11.8.0-py310-torch2.1.0-tf2.14.0-1.9.5-server python -m vllm.entrypoints.api_server --model "modelscope/Llama-2-7b-chat-ms" --revision "v1.0.5" --port 9090 +``` diff --git a/examples/apps/llm_riddles/README_CN.md b/examples/apps/llm_riddles/README_CN.md index 0f85734c0..143900736 100644 --- a/examples/apps/llm_riddles/README_CN.md +++ b/examples/apps/llm_riddles/README_CN.md @@ -1,12 +1,15 @@ # 完蛋!我被LLM包围了!(LLMRiddles) ## 项目简介 -《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用LLM代码生成, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个游戏中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 +《完蛋!我被LLM包围了!》是一款智力挑战游戏。该项目利用LLM代码生成, 基于ModelScope社区内现有的LLM对话Gradio应用程序代码,结合知乎文章[《如何用“不可能”完成任务》](https://zhuanlan.zhihu.com/p/665393240)中的预设问题,自动生成了对应的游戏代码,创造了一个独特的游戏体验。在这个游戏中,玩家需要巧妙构造问题,挑战LLM给出满足特定条件的回答。 ## 更新 -2023.11.9 新增两道题目, 新增chatglm-turbo模型🔥 🔥🔥 + +2023.11.9 新增两道题目, 新增chatglm-turbo模型🔥🔥🔥 + 2023.11.7 发布初版demo🔥 + 2023.11.8 
拆分关卡模块和llm,支持关卡独立接入,llm独立接入, 欢迎PR 🔥 🔥 ## 开始游戏 @@ -16,6 +19,7 @@ [LLMRiddles](https://modelscope.cn/studios/LLMRiddles/LLMRiddles/summary) ### 本地运行 + 要开始游戏,请按照以下步骤操作: 1. 克隆项目代码: @@ -28,6 +32,7 @@ 5. 执行启动命令`python app.py`. ## RoadMap + - [x] 初版本源码和创空间体验ready - [x] 支持自定义问题和验证逻辑接入 - [ ] 扩充到9个大关卡,每个关卡9个问题 @@ -35,6 +40,7 @@ - [ ] 支持云端API和本地推理切换 ## 贡献指南 + 我们欢迎大家为《完蛋!我被LLM包围了!》做出贡献,包括提出更多好玩的问题,修复validator的corner case,以及提供更多的玩法。请按以下步骤操作: 1. 访问项目地址 [ModelScope](https://github.com/modelscope/modelscope) 并fork项目。 @@ -44,13 +50,16 @@ 5. 在原项目下发起一个Pull Request。 ## 社区贡献者 + 我们诚挚感谢所有对本项目做出贡献的社区成员,特别是: - idea来源: [haoqiangfan](https://www.zhihu.com/people/haoqiang-fan) - 代码大部分来自于LLM自动生成 ## 支持 + 如果你在游戏过程中遇到任何问题或需要帮助,请通过项目的[Issues页面](https://github.com/modelscope/modelscope/issues)提交你的问题。 ## 版权和许可 + 本项目采用APACHE License许可证。请查看项目中的[LICENSE](https://github.com/modelscope/modelscope/blob/main/LICENSE)文件了解更多信息。 diff --git a/examples/apps/llm_riddles/app.py b/examples/apps/llm_riddles/app.py index 94432043c..30b6febf1 100644 --- a/examples/apps/llm_riddles/app.py +++ b/examples/apps/llm_riddles/app.py @@ -3,12 +3,15 @@ import os import random import re +import tarfile import gradio as gr +import requests from challenges.ch1 import challenge1 from challenges.ch2 import challenge2 from challenges.ch3 import challenge3 from challenges.ch4 import challenge4 +from challenges.ch5 import challenge5 from llm import create_model from PIL import Image, ImageDraw, ImageFont @@ -20,6 +23,7 @@ challenge2, challenge3, challenge4, + challenge5, ] CONGRATS_STR = '所有挑战完成!👏🏻👏🏻👏🏻👏🏻👏🏻👏🏻' @@ -156,6 +160,49 @@ def generate_share_image(state): return gr.Image.update(visible=True, value=img_pil) +def download_resource(url, extract_path='.'): + """ + 下载资源文件,解压到指定路径。 + + Args: + url: 要下载的文件的URL + extract_path: 解压文件的目标路径 + """ + try: + # 定义文件名 + filename = url.split('/')[-1] + + # 下载文件 + print(f'Downloading the file from {url}...') + response = requests.get(url, stream=True) + if response.status_code == 200: + with open(filename, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + f.write(chunk) + else: + print( + f'Error: Unable to download file. Status code: {response.status_code}' + ) + return + + # 解压文件 + print(f'Extracting the file to {extract_path}...') + if tarfile.is_tarfile(filename): + with tarfile.open(filename, 'r:*') as tar: + tar.extractall(path=extract_path) + else: + print('Error: The downloaded file is not a tar file.') + + # 删除临时文件 + print(f'Removing the temporary file {filename}...') + os.remove(filename) + print( + 'File downloaded, extracted, and temporary file removed successfully.' 
+ ) + except Exception as e: + print(f'An error occurred: {e}') + + def create_app(): # Gradio界面构建 block = gr.Blocks() @@ -220,4 +267,8 @@ def create_app(): if __name__ == '__main__': + if not os.path.exists('assets'): + download_resource( + 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/llm_riddles_assets.tar' + ) create_app() diff --git a/examples/apps/llm_riddles/assets/background.png b/examples/apps/llm_riddles/assets/background.png deleted file mode 100644 index 9d0cb3c92..000000000 --- a/examples/apps/llm_riddles/assets/background.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8afcec15a87bcfaff327a5c9564a31ff1fe185a63cb286bd9772c8c68216768a -size 757003 diff --git a/examples/apps/llm_riddles/assets/background0.png b/examples/apps/llm_riddles/assets/background0.png deleted file mode 100644 index 163942802..000000000 --- a/examples/apps/llm_riddles/assets/background0.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16afb18994ad0654b31117931aad2ee05863492e964e10f4c559556e29618320 -size 839643 diff --git a/examples/apps/llm_riddles/assets/background1.png b/examples/apps/llm_riddles/assets/background1.png deleted file mode 100644 index 9d0cb3c92..000000000 --- a/examples/apps/llm_riddles/assets/background1.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8afcec15a87bcfaff327a5c9564a31ff1fe185a63cb286bd9772c8c68216768a -size 757003 diff --git a/examples/apps/llm_riddles/assets/background2.png b/examples/apps/llm_riddles/assets/background2.png deleted file mode 100644 index adec77231..000000000 --- a/examples/apps/llm_riddles/assets/background2.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:966a013913042e1574ccbc299b1914272cb47df69a552bf1723b96b2d8902de3 -size 1114172 diff --git a/examples/apps/llm_riddles/assets/background3.png b/examples/apps/llm_riddles/assets/background3.png deleted file mode 100644 index 97c446d6a..000000000 --- a/examples/apps/llm_riddles/assets/background3.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5253bbed99be55e6ac9080ea320df75c95592204696d6d41ba90f9905384fdca -size 1198295 diff --git a/examples/apps/llm_riddles/assets/background4.png b/examples/apps/llm_riddles/assets/background4.png deleted file mode 100644 index fc612898c..000000000 --- a/examples/apps/llm_riddles/assets/background4.png +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4cf462f8db7583843bc152ccfc87bb033b91880c98db9f83ba87fcca5d5d07f2 -size 1056053 diff --git a/examples/apps/llm_riddles/assets/font.ttf b/examples/apps/llm_riddles/assets/font.ttf deleted file mode 100644 index 7dc813443..000000000 Binary files a/examples/apps/llm_riddles/assets/font.ttf and /dev/null differ diff --git a/examples/apps/llm_riddles/challenges/ch2.py b/examples/apps/llm_riddles/challenges/ch2.py index 5c381de66..91b990150 100644 --- a/examples/apps/llm_riddles/challenges/ch2.py +++ b/examples/apps/llm_riddles/challenges/ch2.py @@ -23,6 +23,14 @@ def get_square_root(n): return int(sympy.sqrt(n)) +# 验证函数 - 微言大义 +def validate_9(response, input): + input_yes = len(input) <= 10 + output_yes = len(response) >= 9 and response.isdigit() and sympy.isprime( + int(response)) + return input_yes and output_yes + + challenge2 = { 'name': '第二章 数字游戏', @@ -114,5 +122,10 @@ def get_square_root(n): char not in input for char in '零一二三四五六七八九十') and len( set(re.findall(r'\d', 
response))) == 10) }, + { + 'title': '第9题 微言大义', + 'description': '请输入10个字以内的问题,使得模型的回答是一个超过一亿的素数', + 'validator': validate_9 + } ] } diff --git a/examples/apps/llm_riddles/challenges/ch5.py b/examples/apps/llm_riddles/challenges/ch5.py new file mode 100644 index 000000000..ce918226a --- /dev/null +++ b/examples/apps/llm_riddles/challenges/ch5.py @@ -0,0 +1,35 @@ +def check_word_in_sentence(words, sentence): + return [word in sentence for word in words] + + +challenge5 = { + 'name': + '第五章 登堂入室', + 'problems': [ + { + 'title': + '第1题 盛夏少年', + 'description': + '模型的回答应该包含“盛夏”、“蝉鸣”、“少年”、“橘子味汽水”这几个词,同时输入的问题不能包含其中任一个词。', + 'validator': + lambda response, input: all( + check_word_in_sentence(['盛夏', '蝉鸣', '少年', '橘子味汽水'], response)) + and not any( + check_word_in_sentence(['盛夏', '蝉鸣', '少年', '橘子味汽水'], input)) + }, + { + 'title': + '第2题 蝉鸣日出', + 'description': + '模型的回答应该包含“盛夏”、“蝉鸣”、“少年”、“橘子味汽水”、“日出”这几个词,同时输入的问题不能包含其中任一个字。', + 'validator': + lambda response, input: all( + check_word_in_sentence( + ['盛夏', '蝉鸣', '少年', '橘子味汽水', '日出'], response)) and not any( + check_word_in_sentence([ + '盛', '夏', '蝉', '鸣', '少', '年', '橘', '子', '味', '汽', + '水', '日', '出' + ], input)) + }, + ] +} diff --git a/examples/apps/llm_riddles/check_challenge.py b/examples/apps/llm_riddles/check_challenge.py new file mode 100644 index 000000000..c8d225208 --- /dev/null +++ b/examples/apps/llm_riddles/check_challenge.py @@ -0,0 +1,28 @@ +from app import challenges, generate_response + + +def check_answer(chap_idx, + challenge_idx, + input='input', + model_name='qwen-max'): + print('第{}章 第{}题'.format(chap_idx + 1, challenge_idx + 1)) + challenge = challenges[chap_idx]['problems'][challenge_idx] + print(challenge['description']) + val_fn = challenge['validator'] + response = generate_response(input, model_name) + try: + res = val_fn(response, input) + print('input:\n', input) + print('response:\n', response) + print('validation result: ', res) + except Exception: + import traceback + traceback.print_exc() + print('failed') + + +if __name__ == '__main__': + chap = 5 + ques = 1 + input = '请使用“盛 夏”、“蝉 鸣”、“少 年”、“橘 子味汽水”这几个词造句' + check_answer(chap - 1, ques - 1, input) diff --git a/examples/pytorch/FILE_TRANSFER.md b/examples/pytorch/FILE_TRANSFER.md new file mode 100644 index 000000000..690e3bcf9 --- /dev/null +++ b/examples/pytorch/FILE_TRANSFER.md @@ -0,0 +1,3 @@ +# NOTE + +`DiT_ImageNet_Demo.ipynb`, `SiT_ImageNet_Demo.ipynb`, `ViViT-demo.ipynb`, `UViT_ImageNet_demo.ipynb` are moved to the [modelscope-classroom repo](https://github.com/modelscope/modelscope-classroom) diff --git a/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb b/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb new file mode 100644 index 000000000..c8ba95556 --- /dev/null +++ b/examples/pytorch/application/qwen1.5_doc_search_QA_based_on_langchain.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "a33c5c7a-6d2f-4f38-b72a-ff5f07896184", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install llama-index llama-index-llms-huggingface ipywidgets\n", + "!pip install transformers -U" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fd3b2a78-5782-4f76-8d09-52b6b07a96b8", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:49:50.997974Z", + "iopub.status.busy": "2024-02-21T05:49:50.997681Z", + "iopub.status.idle": 
"2024-02-21T05:49:54.378226Z", + "shell.execute_reply": "2024-02-21T05:49:54.377769Z", + "shell.execute_reply.started": "2024-02-21T05:49:50.997954Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-21 13:49:53,743 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.\n", + "2024-02-21 13:49:53,745 - modelscope - INFO - TensorFlow version 2.14.0 Found.\n", + "2024-02-21 13:49:53,746 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n", + "2024-02-21 13:49:53,746 - modelscope - INFO - No valid ast index found from /mnt/workspace/.cache/modelscope/ast_indexer, generating ast index from prebuilt!\n", + "2024-02-21 13:49:53,803 - modelscope - INFO - Loading done! Current index file version is 1.12.0, with md5 509123dba36c5e70a95f6780df348471 and a total number of 964 components indexed\n" + ] + } + ], + "source": [ + "import logging\n", + "import sys\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "\n", + "from IPython.display import Markdown, display\n", + "import torch\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "from llama_index.core.prompts import PromptTemplate\n", + "from modelscope import snapshot_download\n", + "from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding\n", + "from abc import ABC\n", + "from typing import Any, List, Optional, Dict, cast\n", + "from llama_index.core import (\n", + " VectorStoreIndex,\n", + " ServiceContext,\n", + " set_global_service_context,\n", + " SimpleDirectoryReader,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c8375e4c-21c3-433c-a7b1-945007a73ac2", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:49:57.097256Z", + "iopub.status.busy": "2024-02-21T05:49:57.096804Z", + "iopub.status.idle": "2024-02-21T05:50:38.941821Z", + "shell.execute_reply": "2024-02-21T05:50:38.941368Z", + "shell.execute_reply.started": "2024-02-21T05:49:57.097233Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Downloading: 100%|██████████| 662/662 [00:00<00:00, 6.94MB/s]\n", + "Downloading: 100%|██████████| 51.0/51.0 [00:00<00:00, 586kB/s]\n", + "Downloading: 100%|██████████| 178/178 [00:00<00:00, 2.13MB/s]\n", + "Downloading: 100%|██████████| 1.59M/1.59M [00:00<00:00, 27.9MB/s]\n", + "Downloading: 100%|█████████▉| 3.72G/3.72G [00:08<00:00, 449MB/s]\n", + "Downloading: 100%|█████████▉| 3.64G/3.64G [00:11<00:00, 336MB/s]\n", + "Downloading: 100%|██████████| 38.7k/38.7k [00:00<00:00, 40.0MB/s]\n", + "Downloading: 100%|██████████| 4.13k/4.13k [00:00<00:00, 5.90MB/s]\n", + "Downloading: 100%|██████████| 6.70M/6.70M [00:00<00:00, 121MB/s]\n", + "Downloading: 100%|██████████| 1.13k/1.13k [00:00<00:00, 12.4MB/s]\n", + "Downloading: 100%|██████████| 2.65M/2.65M [00:00<00:00, 91.6MB/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n", + "We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. 
You can set `max_memory` in to a higher value to use more memory (at your own risk).\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "875c92489c8047c7881342f422f47c79", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Loading checkpoint shards: 0%| | 0/2 [00:00>\\n\" + SYSTEM_PROMPT + \"<>\\n\\n{query_str}[/INST] \"\n", + ")\n", + "\n", + "llm = HuggingFaceLLM(\n", + " context_window=4096,\n", + " max_new_tokens=2048,\n", + " generate_kwargs={\"temperature\": 0.0, \"do_sample\": False},\n", + " query_wrapper_prompt=query_wrapper_prompt,\n", + " tokenizer_name=selected_model,\n", + " model_name=selected_model,\n", + " device_map=\"auto\",\n", + " # change these settings below depending on your GPU\n", + " model_kwargs={\"torch_dtype\": torch.float16},\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "38d1acab-e916-459b-9a11-e39a63751d47", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:00.938021Z", + "iopub.status.busy": "2024-02-21T05:51:00.937708Z", + "iopub.status.idle": "2024-02-21T05:51:01.687136Z", + "shell.execute_reply": "2024-02-21T05:51:01.686435Z", + "shell.execute_reply.started": "2024-02-21T05:51:00.937998Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-02-21 13:51:01-- https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "正在解析主机 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)... 8.131.208.119\n", + "正在连接 modelscope.oss-cn-beijing.aliyuncs.com (modelscope.oss-cn-beijing.aliyuncs.com)|8.131.208.119|:443... 已连接。\n", + "已发出 HTTP 请求,正在等待回应... 200 OK\n", + "长度: 13228 (13K) [text/markdown]\n", + "正在保存至: ‘data/xianjiaoda/xianjiaoda.md’\n", + "\n", + "data/xianjiaoda/xia 100%[===================>] 12.92K --.-KB/s 用时 0s \n", + "\n", + "2024-02-21 13:51:01 (31.7 MB/s) - 已保存 ‘data/xianjiaoda/xianjiaoda.md’ [13228/13228])\n", + "\n" + ] + } + ], + "source": [ + "!mkdir -p 'data/xianjiaoda/'\n", + "!wget 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md' -O 'data/xianjiaoda/xianjiaoda.md'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "75ffc74f-a732-4748-8cb8-481cd8a39f81", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# load documents\n", + "documents = SimpleDirectoryReader(\"/mnt/workspace/data/xianjiaoda/\").load_data()\n", + "documents" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5689eeaa-8d2c-4df5-9165-abde5d1b3702", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:07.044053Z", + "iopub.status.busy": "2024-02-21T05:51:07.043752Z", + "iopub.status.idle": "2024-02-21T05:51:07.051731Z", + "shell.execute_reply": "2024-02-21T05:51:07.051278Z", + "shell.execute_reply.started": "2024-02-21T05:51:07.044036Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " 
from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " # 使用modelscope的embedding模型(包含下载)\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> List[float]:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embedding(self, text: str) -> List[float]:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding'].tolist()\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> List[float]:\n", + " return self._get_query_embedding(query)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8590cf73-bb5b-498c-993d-d24f15aad77e", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:51:09.906919Z", + "iopub.status.busy": "2024-02-21T05:51:09.906610Z", + "iopub.status.idle": "2024-02-21T05:51:17.813191Z", + "shell.execute_reply": "2024-02-21T05:51:17.812713Z", + "shell.execute_reply.started": "2024-02-21T05:51:09.906901Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:datasets:PyTorch version 2.1.2+cu121 available.\n", + "PyTorch version 2.1.2+cu121 available.\n", + "INFO:datasets:TensorFlow version 2.14.0 available.\n", + "TensorFlow version 2.14.0 available.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-02-21 13:51:10,907 - modelscope - WARNING - Model revision not specified, use revision: v1.1.0\n", + "Downloading: 100%|██████████| 917/917 [00:00<00:00, 6.18MB/s]\n", + "Downloading: 100%|██████████| 2.29k/2.29k [00:00<00:00, 23.5MB/s]\n", + "Downloading: 100%|██████████| 60.7k/60.7k [00:00<00:00, 26.3MB/s]\n", + "Downloading: 100%|██████████| 195M/195M [00:00<00:00, 383MB/s] \n", + "Downloading: 100%|██████████| 11.4k/11.4k [00:00<00:00, 40.4MB/s]\n", + "Downloading: 100%|██████████| 125/125 [00:00<00:00, 684kB/s]\n", + "Downloading: 100%|██████████| 429k/429k [00:00<00:00, 20.8MB/s]\n", + "Downloading: 100%|██████████| 366/366 [00:00<00:00, 4.25MB/s]\n", + "2024-02-21 13:51:15,095 - modelscope - INFO - initiate model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base\n", + "2024-02-21 13:51:15,096 - modelscope - INFO - initiate model from location /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base.\n", + "2024-02-21 13:51:15,096 - modelscope - INFO - initialize model from /mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base\n", + "/opt/conda/lib/python3.10/site-packages/torch/_utils.py:831: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. 
To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()\n", + " return self.fget.__get__(instance, owner)()\n", + "2024-02-21 13:51:15,741 - modelscope - WARNING - No preprocessor field found in cfg.\n", + "2024-02-21 13:51:15,742 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n", + "2024-02-21 13:51:15,742 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base'}. trying to build by task and model information.\n", + "2024-02-21 13:51:15,762 - modelscope - WARNING - No preprocessor field found in cfg.\n", + "2024-02-21 13:51:15,762 - modelscope - WARNING - No val key and type key found in preprocessor domain of configuration.json file.\n", + "2024-02-21 13:51:15,763 - modelscope - WARNING - Cannot find available config to build preprocessor at mode inference, current config: {'model_dir': '/mnt/workspace/.cache/modelscope/damo/nlp_gte_sentence-embedding_chinese-base', 'sequence_length': 128}. trying to build by task and model information.\n", + "/tmp/ipykernel_442/427817804.py:2: DeprecationWarning: Call to deprecated class method from_defaults. (ServiceContext is deprecated, please use `llama_index.settings.Settings` instead.) -- Deprecated since version 0.10.0.\n", + " service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "/opt/conda/lib/python3.10/site-packages/transformers/modeling_utils.py:993: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "set_global_service_context(service_context)\n", + "\n", + "index = VectorStoreIndex.from_documents(documents)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "df218d21-9ad1-42f3-b44c-47aa56f6edcf", + "metadata": { + "execution": { + "iopub.execute_input": "2024-02-21T05:51:20.557315Z", + "iopub.status.busy": "2024-02-21T05:51:20.556991Z", + "iopub.status.idle": "2024-02-21T05:51:20.610136Z", + "shell.execute_reply": "2024-02-21T05:51:20.609707Z", + "shell.execute_reply.started": "2024-02-21T05:51:20.557297Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# set Logging to DEBUG for more detailed outputs\n", + "query_engine = index.as_query_engine()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "10c8c01f-c923-4234-a93e-c37a39358f5b", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-21T05:59:18.934204Z", + "iopub.status.busy": "2024-02-21T05:59:18.933908Z", + "iopub.status.idle": "2024-02-21T05:59:19.777534Z", + "shell.execute_reply": "2024-02-21T05:59:19.777054Z", + "shell.execute_reply.started": "2024-02-21T05:59:18.934187Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Setting `pad_token_id` to `eos_token_id`:151645 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2000年国务院决定将西安交通大学、西安医科大学、陕西财经学院三校合并,组建新的西安交通大学\n" + ] + } + ], + "source": [ + "response = query_engine.query(\"西安交大是由哪几个学校合并的?\")\n", + "print(response)\n", + "#display(Markdown(f\"{response}\"))" + ] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb new file mode 100644 index 000000000..e6ddabfd5 --- /dev/null +++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_langchain_llamaindex.ipynb @@ -0,0 +1,326 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Usage\n", + "1. Install python dependencies\n", + "```shell\n", + "!pip install pypdf langchain unstructured transformers_stream_generator\n", + "!pip install modelscope nltk pydantic tiktoken llama-index\n", + "```\n", + "\n", + "2. Download data files we need in this example\n", + "```shell\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/averaged_perceptron_tagger.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /root/nltk_data/tokenizers\n", + "!mkdir -p /root/nltk_data/taggers\n", + "!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers\n", + "!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers\n", + "!cd /root/nltk_data/tokenizers; unzip punkt.zip;\n", + "!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace\n", + "``` \n", + "\n", + "3. 
Enjoy your QA AI" + ], + "metadata": { + "collapsed": false + }, + "id": "8230365523c9330a" + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a407764-9392-48ae-9bed-8c73c9f76fbc", + "metadata": { + "execution": { + "iopub.execute_input": "2024-01-16T08:58:56.323000Z", + "iopub.status.busy": "2024-01-16T08:58:56.322690Z", + "iopub.status.idle": "2024-01-16T08:59:57.862755Z", + "shell.execute_reply": "2024-01-16T08:59:57.862041Z", + "shell.execute_reply.started": "2024-01-16T08:58:56.322980Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!pip install pypdf langchain unstructured transformers_stream_generator\n", + "!pip install modelscope nltk pydantic tiktoken llama-index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "696c6b78-53e8-4135-8376-ce8902b7d79a", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-01-16T09:04:59.193375Z", + "iopub.status.busy": "2024-01-16T09:04:59.193082Z", + "iopub.status.idle": "2024-01-16T09:05:00.971449Z", + "shell.execute_reply": "2024-01-16T09:05:00.970857Z", + "shell.execute_reply.started": "2024-01-16T09:04:59.193357Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/averaged_perceptron_tagger.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /root/nltk_data/tokenizers\n", + "!mkdir -p /root/nltk_data/taggers\n", + "!cp /mnt/workspace/punkt.zip /root/nltk_data/tokenizers\n", + "!cp /mnt/workspace/averaged_perceptron_tagger.zip /root/nltk_data/taggers\n", + "!cd /root/nltk_data/tokenizers; unzip punkt.zip;\n", + "!cd /root/nltk_data/taggers; unzip averaged_perceptron_tagger.zip;\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cb8feca-c71f-4ad6-8eff-caae95411aa0", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-01-16T09:06:03.024995Z", + "iopub.status.busy": "2024-01-16T09:06:03.024622Z", + "iopub.status.idle": "2024-01-16T09:09:15.894774Z", + "shell.execute_reply": "2024-01-16T09:09:15.894230Z", + "shell.execute_reply.started": "2024-01-16T09:06:03.024974Z" + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from abc import ABC\n", + "from typing import Any, List, Optional, Dict, cast\n", + "\n", + "import torch\n", + "from langchain_core.language_models.llms import LLM\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import ChatPromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "from modelscope import AutoModelForCausalLM, AutoTokenizer\n", + "from llama_index import GPTVectorStoreIndex, SimpleDirectoryReader\n", + "from llama_index import ServiceContext\n", + "from llama_index.embeddings.base import BaseEmbedding\n", + "from llama_index import set_global_service_context\n", + "from langchain_core.retrievers import BaseRetriever\n", + "from langchain_core.callbacks import CallbackManagerForRetrieverRun\n", + "from langchain_core.documents import Document\n", + "from llama_index.retrievers import VectorIndexRetriever\n", + "\n", + "# configs for LLM\n", + "llm_name = 
\"Qwen/Qwen-1_8B-Chat\"\n", + "llm_revision = \"master\"\n", + "\n", + "# configs for embedding model\n", + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-small\"\n", + "\n", + "# file path for your custom knowledge base\n", + "knowledge_doc_file_dir = \"/mnt/workspace/custom_data/\"\n", + "knowledge_doc_file_path = knowledge_doc_file_dir + \"xianjiaoda.md\"\n", + "\n", + "\n", + "# define our Embedding class to use models in Modelscope\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-small\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> List[float]:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0]\n", + "\n", + " def _get_text_embedding(self, text: str) -> List[float]:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0]\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[List[float]]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding']\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> List[float]:\n", + " return self._get_query_embedding(query)\n", + "\n", + "\n", + "# define our Retriever with llama-index to co-operate with Langchain\n", + "# note that the 'LlamaIndexRetriever' defined in langchain-community.retrievers.llama_index.py\n", + "# is no longer compatible with llamaIndex code right now.\n", + "class LlamaIndexRetriever(BaseRetriever):\n", + " index: Any\n", + " \"\"\"LlamaIndex index to query.\"\"\"\n", + "\n", + " def _get_relevant_documents(\n", + " self, query: str, *, run_manager: CallbackManagerForRetrieverRun\n", + " ) -> List[Document]:\n", + " \"\"\"Get documents relevant for a query.\"\"\"\n", + " try:\n", + " from llama_index.indices.base import BaseIndex\n", + " from llama_index.response.schema import Response\n", + " except ImportError:\n", + " raise ImportError(\n", + " \"You need to install `pip install llama-index` to use this retriever.\"\n", + " )\n", + " index = cast(BaseIndex, self.index)\n", + " print('@@@ query=', query)\n", + "\n", + " response = index.as_query_engine().query(query)\n", + " response = cast(Response, response)\n", + " # parse source nodes\n", + " docs = []\n", + " for source_node in response.source_nodes:\n", + " print('@@@@ source=', source_node)\n", + " metadata = source_node.metadata or {}\n", + " docs.append(\n", + " Document(page_content=source_node.get_text(), metadata=metadata)\n", + " )\n", + " return docs\n", + "\n", + "def torch_gc():\n", + " os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", + " DEVICE = \"cuda\"\n", + " DEVICE_ID = \"0\"\n", + " CUDA_DEVICE = 
f\"{DEVICE}:{DEVICE_ID}\" if DEVICE_ID else DEVICE\n", + " a = torch.Tensor([1, 2])\n", + " a = a.cuda()\n", + " print(a)\n", + "\n", + " if torch.cuda.is_available():\n", + " with torch.cuda.device(CUDA_DEVICE):\n", + " torch.cuda.empty_cache()\n", + " torch.cuda.ipc_collect()\n", + "\n", + "\n", + "# global resources used by QianWenChatLLM (this is not a good practice)\n", + "tokenizer = AutoTokenizer.from_pretrained(llm_name, revision=llm_revision, trust_remote_code=True)\n", + "model = AutoModelForCausalLM.from_pretrained(llm_name, revision=llm_revision, device_map=\"auto\",\n", + " trust_remote_code=True, fp16=True).eval()\n", + "\n", + "\n", + "# define QianWen LLM based on langchain's LLM to use models in Modelscope\n", + "class QianWenChatLLM(LLM):\n", + " max_length = 10000\n", + " temperature: float = 0.01\n", + " top_p = 0.9\n", + "\n", + " def __init__(self):\n", + " super().__init__()\n", + "\n", + " @property\n", + " def _llm_type(self):\n", + " return \"ChatLLM\"\n", + "\n", + " def _call(\n", + " self,\n", + " prompt: str,\n", + " stop: Optional[List[str]] = None,\n", + " run_manager=None,\n", + " **kwargs: Any,\n", + " ) -> str:\n", + " print(prompt)\n", + " response, history = model.chat(tokenizer, prompt, history=None)\n", + " torch_gc()\n", + " return response\n", + "\n", + "\n", + "# STEP1: create LLM instance\n", + "qwllm = QianWenChatLLM()\n", + "print('STEP1: qianwen LLM created')\n", + "\n", + "# STEP2: load knowledge file and initialize vector db by llamaIndex\n", + "print('STEP2: reading docs ...')\n", + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=None)\n", + "set_global_service_context(service_context) # global config, not good\n", + "\n", + "llamaIndex_docs = SimpleDirectoryReader(knowledge_doc_file_dir).load_data()\n", + "llamaIndex_index = GPTVectorStoreIndex.from_documents(llamaIndex_docs, chunk_size=512)\n", + "retriever = LlamaIndexRetriever(index=llamaIndex_index)\n", + "print(' 2.2 reading doc done, vec db created.')\n", + "\n", + "# STEP3: create chat template\n", + "prompt_template = \"\"\"请基于```内的内容回答问题。\"\n", + "```\n", + "{context}\n", + "```\n", + "我的问题是:{question}。\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template=prompt_template)\n", + "print('STEP3: chat prompt template created.')\n", + "\n", + "# STEP4: create RAG chain to do QA\n", + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | qwllm\n", + " | StrOutputParser()\n", + ")\n", + "chain.invoke('西安交大的校训是什么?')\n", + "# chain.invoke('魔搭社区有哪些模型?')\n", + "# chain.invoke('modelscope是什么?')\n", + "# chain.invoke('萧峰和乔峰是什么关系?')\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb new file mode 100644 index 000000000..194c46a20 --- /dev/null +++ b/examples/pytorch/application/qwen_doc_search_QA_based_on_llamaindex.ipynb @@ -0,0 +1,223 @@ +{ + "cells": [ + { + "cell_type": 
"markdown", + "source": [ + "# Usage\n", + "\n", + "## 1. Install necessary libs\n", + "```shell\n", + "!pip install modelscope\n", + "!pip install transformers -U\n", + "!pip install llama-index llama-index-llms-huggingface ipywidgets \n", + "```\n", + "\n", + "## 2. Download data files we need in this example\n", + "```shell\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/stopwords.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "\n", + "!cp /mnt/workspace/punkt.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!cp /mnt/workspace/stopwords.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers; unzip punkt.zip;\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora; unzip stopwords.zip;\n", + "\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace\n", + "```\n", + "\n", + "## 3. Go!" + ], + "metadata": { + "collapsed": false + }, + "id": "f4abc589d9bfffca" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "!pip install modelscope\n", + "!pip install transformers -U\n", + "!pip install llama-index llama-index-llms-huggingface ipywidgets " + ], + "metadata": { + "collapsed": false + }, + "id": "c32122833dd7b8c8" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/punkt.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/stopwords.zip\n", + "!wget https://modelscope.oss-cn-beijing.aliyuncs.com/resource/rag/xianjiaoda.md\n", + "\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!mkdir -p /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "\n", + "!cp /mnt/workspace/punkt.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers\n", + "!cp /mnt/workspace/stopwords.zip /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/tokenizers; unzip punkt.zip;\n", + "!cd /opt/conda/lib/python3.10/site-packages/llama_index/core/_static/nltk_cache/corpora; unzip stopwords.zip;\n", + "\n", + "\n", + "!mkdir -p /mnt/workspace/custom_data\n", + "!mv /mnt/workspace/xianjiaoda.md /mnt/workspace/custom_data\n", + "\n", + "!cd /mnt/workspace" + ], + "metadata": { + "collapsed": false + }, + "id": "63704e2b21a9ba52" + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "import logging\n", + "import sys\n", + "from abc import ABC\n", + "from typing import Any, List\n", + "\n", + "import torch\n", + "from llama_index.core import (\n", + " SimpleDirectoryReader,\n", + " VectorStoreIndex,\n", + " Settings,\n", + " ServiceContext,\n", + " set_global_service_context,\n", + ")\n", + "from llama_index.core.base.embeddings.base import BaseEmbedding, Embedding\n", + "from 
llama_index.core.prompts import PromptTemplate\n", + "from llama_index.llms.huggingface import HuggingFaceLLM\n", + "\n", + "from modelscope import snapshot_download\n", + "\n", + "logging.basicConfig(stream=sys.stdout, level=logging.INFO)\n", + "logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))\n", + "\n", + "# download QWEN model from modelscope\n", + "qwen15_4B_CHAT = \"qwen/Qwen1.5-4B-Chat\"\n", + "selected_model = snapshot_download(qwen15_4B_CHAT)\n", + "\n", + "# define sys prompt\n", + "SYSTEM_PROMPT = \"\"\"You are a helpful AI assistant.\"\"\"\n", + "query_wrapper_prompt = PromptTemplate(\n", + " \"[INST]<>\\n\" + SYSTEM_PROMPT + \"<>\\n\\n{query_str}[/INST] \"\n", + ")\n", + "\n", + "# create HuggingFaceLLM with qwen1.5 \n", + "llm = HuggingFaceLLM(\n", + " context_window=4096,\n", + " max_new_tokens=2048,\n", + " generate_kwargs={\"temperature\": 0.0, \"do_sample\": False},\n", + " query_wrapper_prompt=query_wrapper_prompt,\n", + " tokenizer_name=selected_model,\n", + " model_name=selected_model,\n", + " device_map=\"auto\",\n", + " # change these settings below depending on your GPU\n", + " model_kwargs={\"torch_dtype\": torch.float16},\n", + ")\n", + "print(\"llm created\")\n", + "\n", + "\n", + "# wrap modelscope embedding for llama-index (based on BaseEmbedding)\n", + "class ModelScopeEmbeddings4LlamaIndex(BaseEmbedding, ABC):\n", + " embed: Any = None\n", + " model_id: str = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "\n", + " def __init__(\n", + " self,\n", + " model_id: str,\n", + " **kwargs: Any,\n", + " ) -> None:\n", + " super().__init__(**kwargs)\n", + " try:\n", + " from modelscope.models import Model\n", + " from modelscope.pipelines import pipeline\n", + " from modelscope.utils.constant import Tasks\n", + " # 使用modelscope的embedding模型(包含下载)\n", + " self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)\n", + "\n", + " except ImportError as e:\n", + " raise ValueError(\n", + " \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n", + " ) from e\n", + "\n", + " def _get_query_embedding(self, query: str) -> Embedding:\n", + " text = query.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " # note that we have to call tolist() to change numpy.ndarray into python list\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embedding(self, text: str) -> Embedding:\n", + " text = text.replace(\"\\n\", \" \")\n", + " inputs = {\"source_sentence\": [text]}\n", + " return self.embed(input=inputs)['text_embedding'][0].tolist()\n", + "\n", + " def _get_text_embeddings(self, texts: List[str]) -> List[Embedding]:\n", + " texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n", + " inputs = {\"source_sentence\": texts}\n", + " return self.embed(input=inputs)['text_embedding'].tolist()\n", + "\n", + " async def _aget_query_embedding(self, query: str) -> Embedding:\n", + " return self._get_query_embedding(query)\n", + "\n", + "\n", + "embedding_model = \"damo/nlp_gte_sentence-embedding_chinese-base\"\n", + "embeddings = ModelScopeEmbeddings4LlamaIndex(model_id=embedding_model)\n", + "service_context = ServiceContext.from_defaults(embed_model=embeddings, llm=llm)\n", + "set_global_service_context(service_context)\n", + "Settings.embed_model = embeddings\n", + "\n", + "# load example documents\n", + "documents = SimpleDirectoryReader(\"/mnt/workspace/custom_data/\").load_data()\n", + "\n", + "# create Vector DB\n", 
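+    "# from_documents() chunks the loaded docs, embeds each chunk with the\n",
+    "# ModelScope embedding model configured above, and keeps the vectors in an in-memory store\n",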
+ "index = VectorStoreIndex.from_documents(documents)\n", + "\n", + "# set Logging to DEBUG for more detailed outputs\n", + "query_engine = index.as_query_engine()\n", + "\n", + "# do query\n", + "response = query_engine.query(\"西安较大的校训是什么\")\n", + "print(response)\n" + ], + "metadata": { + "collapsed": false + }, + "id": "eef67659e94045c5" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git "a/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" "b/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" new file mode 100644 index 000000000..234859a7f --- /dev/null +++ "b/examples/pytorch/stable_diffusion/SD\346\216\250\347\220\206\346\234\200\344\275\263\345\256\236\350\267\265.ipynb" @@ -0,0 +1,358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "89373920-4a59-473e-8b7d-7f30570637c7", + "metadata": {}, + "source": [ + "Stable diffusion模型推理方法1:SDXL模型,魔搭社区Pipeline已经集成SDXL模型,可以直接使用" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "641a04c4-ee0b-4cef-93e2-bca0269e7486", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from modelscope.utils.constant import Tasks\n", + "from modelscope.pipelines import pipeline\n", + "import cv2\n", + "\n", + "pipe = pipeline(task=Tasks.text_to_image_synthesis, \n", + " model='AI-ModelScope/stable-diffusion-xl-base-1.0',\n", + " use_safetensors=True,\n", + " model_revision='v1.0.0')\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "output = pipe({'text': prompt})\n", + "cv2.imwrite('SDXL.png', output['output_imgs'][0])" + ] + }, + { + "cell_type": "markdown", + "id": "c5740ed4-2c6a-4b0b-8bb7-6ef466d2a08f", + "metadata": {}, + "source": [ + "秒级推理方法1:SDXL-turbo模型是SDXL 1.0的蒸馏版本,SDXL-Turbo基于一种称之为对抗扩散蒸馏(ADD)的新颖的训练方法,这种方法在扩散模型采样可以减少到1到4步,而生成高质量图像。ADD的训练方式使用得分蒸馏,利用大规模扩散模型作为教师模型,并将其与对抗性损失相结合,即使在1-2步的采样步骤的低步骤状态下,使用对抗学习的方式,引入discriminator来辅助生成质量的把控,也可以确保高质量图像的保真度。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bef68ad6-1fc9-4fff-850e-9bd4cc3ef756", + "metadata": {}, + "outputs": [], + "source": [ + "from diffusers import AutoPipelineForText2Image\n", + "import torch\n", + "from modelscope import snapshot_download\n", + "\n", + "model_dir = snapshot_download(\"AI-ModelScope/sdxl-turbo\")\n", + "\n", + "pipe = AutoPipelineForText2Image.from_pretrained(model_dir, torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipe.to(\"cuda\")\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "\n", + "image = pipe(prompt=prompt, num_inference_steps=1, guidance_scale=0.0).images[0]\n", + "image.save(\"SDXLturbo.png\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"bf25d186-317e-4e53-bed5-c801b336b3ff", + "metadata": {}, + "source": [ + "秒级推理方法2:SDXL+LCM,潜在一致性模型(LCM)受一致性模型(CM)启发,在预训练的LDM上以较少的步骤进行快速推理。LCM-SD系列是在Stable Diffusion的基础上新增Consistency 约束蒸馏的结果,仅通过2-8步的推理即可实现高质量的文本到图片的生成性能。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58e1b7b6-f2d1-4a04-9a31-108f567b5c64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler\n", + "import torch\n", + "from modelscope import snapshot_download\n", + "\n", + "model_dir_lcm = snapshot_download(\"AI-ModelScope/lcm-sdxl\",revision = \"master\")\n", + "model_dir_sdxl = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\",revision = \"v1.0.9\")\n", + "\n", + "unet = UNet2DConditionModel.from_pretrained(model_dir_lcm, torch_dtype=torch.float16, variant=\"fp16\")\n", + "pipe = DiffusionPipeline.from_pretrained(model_dir_sdxl, unet=unet, torch_dtype=torch.float16, variant=\"fp16\")\n", + "\n", + "pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)\n", + "pipe.to(\"cuda\")\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "image = pipe(prompt, num_inference_steps=4, guidance_scale=8.0).images[0]\n", + "image.save(\"SDXLLCM.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "ec6a4dda-2d8c-4fb5-bcbd-468462d9e3c6", + "metadata": {}, + "source": [ + "秒级推理方法3:stable-cascade模型基于Würstchen架构构建,与稳定扩散等其他模型的主要区别在于它在更小的潜在空间中工作。潜在空间越小,推理速度就越快,训练成本也就越低。潜在空间有多小?稳定扩散使用压缩因子 8,从而将 1024x1024 图像编码为 128x128。Stable Cascade 的压缩系数为 42,这意味着可以将 1024x1024 图像编码为 24x24,同时保持清晰的重建。然后在高度压缩的潜在空间中训练文本条件模型。与稳定扩散 1.5 相比,该架构的先前版本实现了 16 倍的成本降低。" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4155f18d-0504-42e6-b785-02ed4a519c1f", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from modelscope import snapshot_download\n", + "from diffusers import StableCascadeDecoderPipeline, StableCascadePriorPipeline\n", + "\n", + "device = \"cuda\"\n", + "num_images_per_prompt = 1\n", + "\n", + "stable_cascade_prior = snapshot_download(\"AI-ModelScope/stable-cascade-prior\")\n", + "stable_cascade = snapshot_download(\"AI-ModelScope/stable-cascade\")\n", + "\n", + "prior = StableCascadePriorPipeline.from_pretrained(stable_cascade_prior, torch_dtype=torch.bfloat16).to(device)\n", + "decoder = StableCascadeDecoderPipeline.from_pretrained(stable_cascade, torch_dtype=torch.float16).to(device)\n", + "\n", + "prompt = \"Beautiful and cute girl, 16 years old, denim jacket, gradient background, soft colors, soft lighting, cinematic edge lighting, light and dark contrast, anime, art station Seraflur, blind box, super detail, 8k\"\n", + "negative_prompt = \"\"\n", + "\n", + "prior_output = prior(\n", + " prompt=prompt,\n", + " height=1024,\n", + " width=1024,\n", + " negative_prompt=negative_prompt,\n", + " guidance_scale=4.0,\n", + " num_images_per_prompt=num_images_per_prompt,\n", + " num_inference_steps=20\n", + ")\n", + "decoder_output = decoder(\n", + " image_embeddings=prior_output.image_embeddings.half(),\n", + " prompt=prompt,\n", + " negative_prompt=negative_prompt,\n", + " guidance_scale=0.0,\n", + " output_type=\"pil\",\n", + " num_inference_steps=10\n", + ").images\n", + "\n", + "for i, img in enumerate(decoder_output):\n", + " 
img.save(f\"stablecascade_{i+1}.png\")\n", + "#Now decoder_output is a list with your PIL images" + ] + }, + { + "cell_type": "markdown", + "id": "c402e461-2245-4e38-839b-6a5992c03b00", + "metadata": {}, + "source": [ + "秒级推理方法4:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f42531c8-c428-4ae7-aef1-b56050bffc71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, EulerDiscreteScheduler\n", + "from modelscope.hub.file_download import model_file_download\n", + "from modelscope import snapshot_download\n", + "from safetensors.torch import load_file\n", + "\n", + "base = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\")\n", + "repo = \"AI-ModelScope/SDXL-Lightning\"\n", + "ckpt = \"sdxl_lightning_4step_unet.safetensors\" # Use the correct ckpt for your step setting!\n", + "\n", + "# Load model.\n", + "unet = UNet2DConditionModel.from_config(base, subfolder=\"unet\").to(\"cuda\", torch.float16)\n", + "unet.load_state_dict(load_file(model_file_download(repo, ckpt), device=\"cuda\"))\n", + "pipe = StableDiffusionXLPipeline.from_pretrained(base, unet=unet, torch_dtype=torch.float16, variant=\"fp16\").to(\"cuda\")\n", + "\n", + "# Ensure sampler uses \"trailing\" timesteps.\n", + "pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config, timestep_spacing=\"trailing\")\n", + "\n", + "# Ensure using the same inference steps as the loaded model and CFG set to 0.\n", + "pipe(\"A girl smiling\", num_inference_steps=4, guidance_scale=0).images[0].save(\"sdxllightning.png\")" + ] + }, + { + "cell_type": "markdown", + "id": "adbedb78-90fb-4509-a3a6-6262d0d51bcf", + "metadata": {}, + "source": [ + "微调lora叠加推理" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c418dc94-6c35-4ac2-8807-e796d5488525", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "tags": [] + }, + "outputs": [], + "source": [ + "from diffusers import AutoPipelineForText2Image\n", + "from modelscope import snapshot_download\n", + "import torch\n", + "\n", + "model_dir=snapshot_download(\"YorickHe/majicmixRealistic_v6\")\n", + "lora_dir = snapshot_download(\"PaperCloud/zju19_dunhuang_style_lora\")\n", + "\n", + "pipeline = AutoPipelineForText2Image.from_pretrained(f\"{model_dir}/v7\", torch_dtype=torch.float16).to(\"cuda\")\n", + "pipeline.load_lora_weights(lora_dir, weight_name=\"dunhuang.safetensors\")\n", + "prompt = \"1 girl, close-up, waist shot, black long hair, clean face, dunhuang, Chinese ancient style, clean skin, organza_lace, Dunhuang wind, Art deco, Necklace, jewelry, Bracelet, Earrings, dunhuang_style, see-through_dress, Expressionism, looking towards the camera, upper_body, raw photo, masterpiece, solo, medium shot, high detail face, photorealistic, best quality\"\n", + "#Negative Prompt = \"\"\"(nsfw:2), paintings, sketches, (worst quality:2), (low quality:2), lowers, normal quality, ((monochrome)), ((grayscale)), logo, word, character, bad hand, tattoo, (username, watermark, signature, time signature, timestamp, artist name, copyright name, copyright),low res, ((monochrome)), ((grayscale)), skin spots, acnes, skin blemishes, age spot, glans, extra fingers, fewer fingers, strange fingers, bad hand, mole, ((extra legs)), ((extra hands))\"\"\"\n", + "image = pipeline(prompt).images[0]\n", + "image.save(\"sdlora.png\")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6c36c14f-9481-48f1-a6ef-617d7551b63d", + "metadata": {}, + 
"source": [ + "SD+controlnet" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1f1c616d-0d45-4a8d-8140-0b6b352920b9", + "metadata": { + "ExecutionIndicator": { + "show": true + }, + "execution": { + "iopub.execute_input": "2024-02-28T00:22:32.730370Z", + "iopub.status.busy": "2024-02-28T00:22:32.729999Z", + "iopub.status.idle": "2024-02-28T00:23:48.650291Z", + "shell.execute_reply": "2024-02-28T00:23:48.649123Z", + "shell.execute_reply.started": "2024-02-28T00:22:32.730354Z" + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "2024-02-28 08:22:35.104069: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-02-28 08:22:35.132215: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-02-28 08:22:35.174367: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "2024-02-28 08:22:35.174385: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "2024-02-28 08:22:35.174411: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-02-28 08:22:35.182970: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.\n", + "2024-02-28 08:22:35.183413: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n", + "2024-02-28 08:22:36.189620: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n", + "2024-02-28 08:22:39,294 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.\n", + "2024-02-28 08:22:39,296 - modelscope - INFO - TensorFlow version 2.14.0 Found.\n", + "2024-02-28 08:22:39,296 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer\n", + "2024-02-28 08:22:39,341 - modelscope - INFO - Loading done! 
Current index file version is 1.12.0, with md5 509123dba36c5e70a95f6780df348471 and a total number of 964 components indexed\n", + "2024-02-28 08:22:39,713 - modelscope - WARNING - Model revision not specified, use revision: v1.0.9\n", + "Loading pipeline components...: 100%|██████████| 7/7 [00:36<00:00, 5.19s/it]\n", + "100%|██████████| 50/50 [00:15<00:00, 3.24it/s]\n" + ] + } + ], + "source": [ + "from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL\n", + "from diffusers.utils import load_image, make_image_grid\n", + "from PIL import Image\n", + "from modelscope import snapshot_download\n", + "import cv2\n", + "import numpy as np\n", + "import torch\n", + "\n", + "\n", + "model_dir = snapshot_download(\"AI-ModelScope/stable-diffusion-xl-base-1.0\")\n", + "controlnet_dir = snapshot_download(\"AI-ModelScope/controlnet-canny-sdxl-1.0\")\n", + "VAE_dir = snapshot_download(\"AI-ModelScope/sdxl-vae-fp16-fix\")\n", + "original_image = load_image(\n", + " \"/mnt/workspace/canny.jpg\"\n", + ")\n", + "\n", + "prompt = \"sea turtle, hard lighting\"\n", + "negative_prompt = 'low quality, bad quality, sketches'\n", + "\n", + "image = load_image(\"/mnt/workspace/canny.jpg\")\n", + "\n", + "controlnet_conditioning_scale = 0.5 # recommended for good generalization\n", + "\n", + "controlnet = ControlNetModel.from_pretrained(\n", + " controlnet_dir,\n", + " torch_dtype=torch.float16\n", + ")\n", + "vae = AutoencoderKL.from_pretrained(VAE_dir, torch_dtype=torch.float16)\n", + "pipe = StableDiffusionXLControlNetPipeline.from_pretrained(\n", + " model_dir,\n", + " controlnet=controlnet,\n", + " vae=vae,\n", + " torch_dtype=torch.float16,\n", + ")\n", + "pipe.enable_model_cpu_offload()\n", + "\n", + "image = np.array(image)\n", + "image = cv2.Canny(image, 100, 200)\n", + "image = image[:, :, None]\n", + "image = np.concatenate([image, image, image], axis=2)\n", + "image = Image.fromarray(image)\n", + "\n", + "images = pipe(\n", + " prompt, negative_prompt=negative_prompt, image=image, controlnet_conditioning_scale=controlnet_conditioning_scale,\n", + " ).images\n", + "\n", + "images[0].save(f\"controlnet.png\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/modelscope/__init__.py b/modelscope/__init__.py index 97abdbd3d..c969be684 100644 --- a/modelscope/__init__.py +++ b/modelscope/__init__.py @@ -1,15 +1,17 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING -from modelscope.utils.import_utils import LazyImportModule -from .utils.automodel_utils import fix_transformers_upgrade +from modelscope.utils.import_utils import (LazyImportModule, + is_transformers_available) if TYPE_CHECKING: from .exporters import Exporter, TfModelExporter, TorchModelExporter from .hub.api import HubApi from .hub.check_model import check_local_model_is_latest, check_model_is_id from .hub.push_to_hub import push_to_hub, push_to_hub_async - from .hub.snapshot_download import snapshot_download + from .hub.snapshot_download import snapshot_download, dataset_snapshot_download + from .hub.file_download import model_file_download, dataset_file_download + from .metrics import ( AccuracyMetric, AudioNoiseMetric, BleuMetric, ImageColorEnhanceMetric, ImageColorizationMetric, ImageDenoiseMetric, ImageInpaintingMetric, @@ -29,13 +31,34 @@ from .trainers import (EpochBasedTrainer, Hook, Priority, TrainingArgs, build_dataset_from_file) from .utils.constant import Tasks - from .utils.hf_util import AutoConfig, GPTQConfig, BitsAndBytesConfig - from .utils.hf_util import (AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoModelForSequenceClassification, - AutoModelForTokenClassification, AutoTokenizer, - GenerationConfig, AutoImageProcessor, - BatchFeature) + if is_transformers_available(): + from .utils.hf_util import ( + AutoModel, AutoProcessor, AutoFeatureExtractor, GenerationConfig, + AutoConfig, GPTQConfig, AwqConfig, BitsAndBytesConfig, + AutoModelForCausalLM, AutoModelForSeq2SeqLM, + AutoModelForVision2Seq, AutoModelForSequenceClassification, + AutoModelForTokenClassification, AutoModelForImageClassification, + AutoModelForImageTextToText, + AutoModelForZeroShotImageClassification, + AutoModelForKeypointDetection, + AutoModelForDocumentQuestionAnswering, + AutoModelForSemanticSegmentation, + AutoModelForUniversalSegmentation, + AutoModelForInstanceSegmentation, AutoModelForObjectDetection, + AutoModelForZeroShotObjectDetection, + AutoModelForAudioClassification, AutoModelForSpeechSeq2Seq, + AutoModelForMaskedImageModeling, + AutoModelForVisualQuestionAnswering, + AutoModelForTableQuestionAnswering, AutoModelForImageToImage, + AutoModelForImageSegmentation, AutoModelForQuestionAnswering, + AutoModelForMaskedLM, AutoTokenizer, AutoModelForMaskGeneration, + AutoModelForPreTraining, AutoModelForTextEncoding, + AutoImageProcessor, BatchFeature, Qwen2VLForConditionalGeneration, + T5EncoderModel) + else: + print( + 'transformer is not installed, please install it if you want to use related modules' + ) from .utils.hub import create_model_if_not_exist, read_config from .utils.logger import get_logger from .version import __release_datetime__, __version__ @@ -53,7 +76,9 @@ 'TorchModelExporter', ], 'hub.api': ['HubApi'], - 'hub.snapshot_download': ['snapshot_download'], + 'hub.snapshot_download': + ['snapshot_download', 'dataset_snapshot_download'], + 'hub.file_download': ['model_file_download', 'dataset_file_download'], 'hub.push_to_hub': ['push_to_hub', 'push_to_hub_async'], 'hub.check_model': ['check_model_is_id', 'check_local_model_is_latest'], @@ -78,17 +103,37 @@ 'utils.hub': ['read_config', 'create_model_if_not_exist'], 'utils.logger': ['get_logger'], 'utils.constant': ['Tasks'], - 'utils.hf_util': [ - 'AutoConfig', 'GenerationConfig', 'AutoModel', 'GPTQConfig', - 'BitsAndBytesConfig', 'AutoModelForCausalLM', - 'AutoModelForSeq2SeqLM', 'AutoTokenizer', - 'AutoModelForSequenceClassification', - 'AutoModelForTokenClassification', 
'AutoImageProcessor', - 'BatchFeature' - ], 'msdatasets': ['MsDataset'] } + if is_transformers_available(): + _import_structure['utils.hf_util'] = [ + 'AutoModel', 'AutoProcessor', 'AutoFeatureExtractor', + 'GenerationConfig', 'AutoConfig', 'GPTQConfig', 'AwqConfig', + 'BitsAndBytesConfig', 'AutoModelForCausalLM', + 'AutoModelForSeq2SeqLM', 'AutoModelForVision2Seq', + 'AutoModelForSequenceClassification', + 'AutoModelForTokenClassification', + 'AutoModelForImageClassification', 'AutoModelForImageToImage', + 'AutoModelForImageTextToText', + 'AutoModelForZeroShotImageClassification', + 'AutoModelForKeypointDetection', + 'AutoModelForDocumentQuestionAnswering', + 'AutoModelForSemanticSegmentation', + 'AutoModelForUniversalSegmentation', + 'AutoModelForInstanceSegmentation', 'AutoModelForObjectDetection', + 'AutoModelForZeroShotObjectDetection', + 'AutoModelForAudioClassification', 'AutoModelForSpeechSeq2Seq', + 'AutoModelForMaskedImageModeling', + 'AutoModelForVisualQuestionAnswering', + 'AutoModelForTableQuestionAnswering', + 'AutoModelForImageSegmentation', 'AutoModelForQuestionAnswering', + 'AutoModelForMaskedLM', 'AutoTokenizer', + 'AutoModelForMaskGeneration', 'AutoModelForPreTraining', + 'AutoModelForTextEncoding', 'AutoImageProcessor', 'BatchFeature', + 'Qwen2VLForConditionalGeneration', 'T5EncoderModel' + ] + import sys sys.modules[__name__] = LazyImportModule( @@ -98,5 +143,3 @@ module_spec=__spec__, extra_objects={}, ) - -fix_transformers_upgrade() diff --git a/modelscope/cli/clearcache.py b/modelscope/cli/clearcache.py new file mode 100644 index 000000000..dcd3d1dfe --- /dev/null +++ b/modelscope/cli/clearcache.py @@ -0,0 +1,113 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import shutil +from argparse import ArgumentParser +from pathlib import Path + +from modelscope.cli.base import CLICommand +from modelscope.hub.constants import TEMPORARY_FOLDER_NAME +from modelscope.hub.utils.utils import get_model_masked_directory + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return ClearCacheCMD(args) + + +class ClearCacheCMD(CLICommand): + name = 'clear-cache' + + def __init__(self, args): + self.args = args + self.cache_dir = os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(ClearCacheCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help= + 'The id of the model whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + group.add_argument( + '--dataset', + type=str, + help= + 'The id of the dataset whose cache will be cleared. For clear-cache, ' + 'if neither model or dataset id is provided, entire cache will be cleared.' + ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + self._execute_with_confirmation() + + def _execute_with_confirmation(self): + all = False + single_model = False + prompt = '\nYou are about to delete ' + + if self.args.model or self.args.dataset: + if self.args.model: + id = self.args.model + single_model = True + prompt = prompt + f'local cache for model {id}. ' + else: + id = self.args.dataset + prompt = prompt + f'local cache for dataset {id}. 
' + else: + prompt = prompt + f'entire ModelScope cache at {self.cache_dir}, including ALL models and dataset.\n' + all = True + user_input = input( + prompt + + '\nPlease press Y or y to proceed, any other key to abort.\n' + ).strip().upper() + + if user_input == 'Y': + if all: + self._remove_directory(self.cache_dir) + print('Cache cleared.') + else: + entity_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', id) + temp_directory = os.path.join( + self.cache_dir, 'hub' if single_model else 'datasets', + TEMPORARY_FOLDER_NAME, id) + entity_removed = self._remove_directory(entity_directory) + temp_removed = self._remove_directory(temp_directory) + if (not entity_removed) and (not temp_removed): + if single_model: + print( + f'Cache for Model {id} not found. Nothing to do.') + else: + print( + f'Cache for Dataset {id} not found. Nothing to do.' + ) + else: + print('Cache cleared.') + else: + print('Operation aborted.') + return + + def _remove_directory(self, path): + if os.path.exists(path): + try: + if os.path.islink(path): + shutil.rmtree(os.readlink(path)) + os.remove(path) + print(f'Cache and link for {path} removed.') + else: + shutil.rmtree(path) + print(f'Cache folder {path} removed.') + return True + except Exception as e: + print(f'An error occurred while clearing cache at {path}: {e}') + return False diff --git a/modelscope/cli/cli.py b/modelscope/cli/cli.py index a25502fde..24fcc134f 100644 --- a/modelscope/cli/cli.py +++ b/modelscope/cli/cli.py @@ -1,29 +1,46 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import argparse +import logging +from modelscope.cli.clearcache import ClearCacheCMD from modelscope.cli.download import DownloadCMD +from modelscope.cli.llamafile import LlamafileCMD +from modelscope.cli.login import LoginCMD from modelscope.cli.modelcard import ModelCardCMD from modelscope.cli.pipeline import PipelineCMD from modelscope.cli.plugins import PluginsCMD +from modelscope.cli.server import ServerCMD +from modelscope.hub.api import HubApi +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) def run_cmd(): parser = argparse.ArgumentParser( 'ModelScope Command Line tool', usage='modelscope []') + parser.add_argument( + '--token', default=None, help='Specify ModelScope SDK token.') subparsers = parser.add_subparsers(help='modelscope commands helpers') DownloadCMD.define_args(subparsers) + ClearCacheCMD.define_args(subparsers) PluginsCMD.define_args(subparsers) PipelineCMD.define_args(subparsers) ModelCardCMD.define_args(subparsers) + ServerCMD.define_args(subparsers) + LoginCMD.define_args(subparsers) + LlamafileCMD.define_args(subparsers) args = parser.parse_args() if not hasattr(args, 'func'): parser.print_help() exit(1) - + if args.token is not None: + api = HubApi() + api.login(args.token) cmd = args.func(args) cmd.execute() diff --git a/modelscope/cli/download.py b/modelscope/cli/download.py index e6d316a29..aa23a3019 100644 --- a/modelscope/cli/download.py +++ b/modelscope/cli/download.py @@ -3,7 +3,10 @@ from argparse import ArgumentParser from modelscope.cli.base import CLICommand -from modelscope.hub.snapshot_download import snapshot_download +from modelscope.hub.file_download import (dataset_file_download, + model_file_download) +from modelscope.hub.snapshot_download import (dataset_snapshot_download, + snapshot_download) def subparser_func(args): @@ -22,9 +25,32 @@ def __init__(self, args): def define_args(parsers: ArgumentParser): """ define args for download command. 
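As a quick reference for the clear-cache subcommand wired up above: it only ever touches two locations per model or dataset, the snapshot folder and its '._____temp' staging twin under the cache root. A minimal sketch of that path resolution, assuming a placeholder model id (the cache-root lookup mirrors ClearCacheCMD.__init__):

    import os
    from pathlib import Path

    # Cache root, resolved the same way ClearCacheCMD does it.
    cache_dir = os.getenv('MODELSCOPE_CACHE',
                          str(Path.home().joinpath('.cache', 'modelscope')))

    model_id = 'namespace/model-name'  # placeholder id, for illustration only
    snapshot_dir = os.path.join(cache_dir, 'hub', model_id)            # downloaded files
    temp_dir = os.path.join(cache_dir, 'hub', '._____temp', model_id)  # partial downloads
    print(snapshot_dir)
    print(temp_dir)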
""" - parser = parsers.add_parser(DownloadCMD.name) + parser: ArgumentParser = parsers.add_parser(DownloadCMD.name) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--model', + type=str, + help='The id of the model to be downloaded. For download, ' + 'the id of either a model or dataset must be provided.') + group.add_argument( + '--dataset', + type=str, + help='The id of the dataset to be downloaded. For download, ' + 'the id of either a model or dataset must be provided.') + parser.add_argument( + 'repo_id', + type=str, + nargs='?', + default=None, + help='Optional, ' + 'ID of the repo to download, It can also be set by --model or --dataset.' + ) parser.add_argument( - 'model', type=str, help='Name of the model to be downloaded.') + '--repo-type', + choices=['model', 'dataset'], + default='model', + help="Type of repo to download from (defaults to 'model').", + ) parser.add_argument( '--revision', type=str, @@ -35,10 +61,105 @@ def define_args(parsers: ArgumentParser): type=str, default=None, help='Cache directory to save model.') + parser.add_argument( + '--local_dir', + type=str, + default=None, + help='File will be downloaded to local location specified by' + 'local_dir, in this case, cache_dir parameter will be ignored.') + parser.add_argument( + 'files', + type=str, + default=None, + nargs='*', + help='Specify relative path to the repository file(s) to download.' + "(e.g 'tokenizer.json', 'onnx/decoder_model.onnx').") + parser.add_argument( + '--include', + nargs='*', + default=None, + type=str, + help='Glob patterns to match files to download.' + 'Ignored if file is specified') + parser.add_argument( + '--exclude', + nargs='*', + type=str, + default=None, + help='Glob patterns to exclude from files to download.' + 'Ignored if file is specified') parser.set_defaults(func=subparser_func) def execute(self): - snapshot_download( - self.args.model, - cache_dir=self.args.cache_dir, - revision=self.args.revision) + if self.args.model or self.args.dataset: + # the position argument of files will be put to repo_id. + if self.args.repo_id is not None: + if self.args.files: + self.args.files.insert(0, self.args.repo_id) + else: + self.args.files = [self.args.repo_id] + else: + if self.args.repo_id is not None: + if self.args.repo_type == 'model': + self.args.model = self.args.repo_id + elif self.args.repo_type == 'dataset': + self.args.dataset = self.args.repo_id + else: + raise Exception('Not support repo-type: %s' + % self.args.repo_type) + if not self.args.model and not self.args.dataset: + raise Exception('Model or dataset must be set.') + if self.args.model: + if len(self.args.files) == 1: # download single file + model_file_download( + self.args.model, + self.args.files[0], + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + revision=self.args.revision) + elif len( + self.args.files) > 1: # download specified multiple files. 
+ snapshot_download( + self.args.model, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.files, + ) + else: # download repo + snapshot_download( + self.args.model, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.include, + ignore_file_pattern=self.args.exclude, + ) + elif self.args.dataset: + if len(self.args.files) == 1: # download single file + dataset_file_download( + self.args.dataset, + self.args.files[0], + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + revision=self.args.revision) + elif len( + self.args.files) > 1: # download specified multiple files. + dataset_snapshot_download( + self.args.dataset, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.files, + ) + else: # download repo + dataset_snapshot_download( + self.args.dataset, + revision=self.args.revision, + cache_dir=self.args.cache_dir, + local_dir=self.args.local_dir, + allow_file_pattern=self.args.include, + ignore_file_pattern=self.args.exclude, + ) + else: + pass # noop diff --git a/modelscope/cli/llamafile.py b/modelscope/cli/llamafile.py new file mode 100644 index 000000000..23f3fe914 --- /dev/null +++ b/modelscope/cli/llamafile.py @@ -0,0 +1,158 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import logging +import os +import sys +from argparse import ArgumentParser + +from modelscope import model_file_download +from modelscope.cli.base import CLICommand +from modelscope.hub.api import HubApi +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return LlamafileCMD(args) + + +class LlamafileCMD(CLICommand): + name = 'llamafile' + + def __init__(self, args): + self.args = args + self.model_id = self.args.model + if self.model_id is None or self.model_id.count('/') != 1: + raise ValueError(f'Invalid model id [{self.model_id}].') + if self.args.file is not None: + # ignore accuracy if file argument is provided + self.args.accuracy = None + if not self.args.file.lower().endswith('.llamafile'): + raise ValueError('file argument must ends with ".llamafile".') + self.api = HubApi() + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for clear-cache command. + """ + parser = parsers.add_parser(LlamafileCMD.name) + parser.add_argument( + '--model', + type=str, + required=True, + help= + 'The id of the model, whose repo must contain at least one llamafile' + ) + group = parser.add_mutually_exclusive_group() + group.add_argument( + '--accuracy', + type=str, + required=False, + default='q4_k_m', + help= + 'Selected accuracy of GGUF files in the repo. Ignored when "file" is also provided.' + ) + + group.add_argument( + '--file', + type=str, + required=False, + help= + 'The name of a specified llamafile in the model repo. This takes precedence over "accuracy".' + ) + + parser.add_argument( + '--local_dir', + type=str, + default=None, + help= + 'Directory where the selected llamafile would will be downloaded to.' + ) + + group.add_argument( + '--launch', + type=str, + required=False, + default='True', + help= + 'Whether to launch model with the downloaded llamafile, default to True.' 
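The behaviour implemented by DownloadCMD.execute above is also reachable directly from Python; a small sketch, with the repo id, file name and patterns as placeholders (a single file goes through model_file_download, a filtered repo through snapshot_download):

    from modelscope.hub.file_download import model_file_download
    from modelscope.hub.snapshot_download import snapshot_download

    repo_id = 'namespace/model-name'  # placeholder

    # One file (CLI: a single positional file argument).
    config_path = model_file_download(repo_id, 'config.json')

    # Whole repo filtered by glob patterns (CLI: --include / --exclude).
    repo_dir = snapshot_download(
        repo_id,
        allow_file_pattern=['*.json', '*.txt'],  # --include
        ignore_file_pattern=['*.bin'],           # --exclude
    )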
+ ) + + parser.set_defaults(func=subparser_func) + + def execute(self): + if self.args.file: + self.args.accuracy = None + + all_files = self.api.get_model_files(self.model_id, recursive=True) + llamafiles = [] + for info in all_files: + file_path = info['Path'] + if file_path and file_path.lower().endswith( + '.llamafile') and '-of-' not in file_path.lower(): + llamafiles.append(file_path) + if not llamafiles: + raise ValueError( + f'Cannot locate a valid llamafile in repo {self.model_id}.') + logger.info( + f'list of llamafiles in repo {self.model_id}:\n{llamafiles}.') + # default choose the first llamafile if there is no q4_k_m, and no accuracy or file is specified + selected_file = llamafiles[0] + found = False + for f in llamafiles: + if self.args.file and f == self.args.file: + selected_file = f + found = True + break + if self.args.accuracy and self.args.accuracy.lower() in f.lower(): + selected_file = f + found = True + break + if found: + print(f'llamafile matching criteria found: [{selected_file}].') + else: + print( + f'No matched llamafile found in repo, choosing the first llamafile in repo: [{selected_file}]' + ) + downloaded_file = os.path.abspath( + model_file_download( + self.args.model, selected_file, local_dir=self.args.local_dir)) + + if sys.platform.startswith('win'): + downloaded_file = self._rename_extension(downloaded_file) + + if self.args.launch.lower() == 'true': + print(f'Launching model with llamafile [{downloaded_file}]:') + self._execute_llamafile(downloaded_file) + else: + print( + f'No Launching. Llamafile model downloaded to [{downloaded_file}], you may execute it separately.' + ) + + def _execute_llamafile(self, file_path): + current_mode = os.stat(file_path).st_mode + new_mode = current_mode | 0o111 + os.chmod(file_path, new_mode) + execute_cmd = file_path + has_gpu = False + try: + import torch + has_gpu = torch.cuda.is_available() + except ModuleNotFoundError: + # we depend on torch to detect gpu. + # if torch is not available, we will just assume gpu cannot be used + pass + if has_gpu: + print( + 'GPU detected, launching model with llamafile GPU option >>>') + execute_cmd = f'{execute_cmd} -ngl 999' + os.system(execute_cmd) + + def _rename_extension(self, original_file_name): + directory, filename = os.path.split(original_file_name) + base_name, _ = os.path.splitext(filename) + new_filename = os.path.join(directory, f'{base_name}.exe') + os.rename(original_file_name, new_filename) + return new_filename diff --git a/modelscope/cli/login.py b/modelscope/cli/login.py new file mode 100644 index 000000000..613b3205a --- /dev/null +++ b/modelscope/cli/login.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +from argparse import ArgumentParser + +from modelscope.cli.base import CLICommand +from modelscope.hub.api import HubApi + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. + """ + return LoginCMD(args) + + +class LoginCMD(CLICommand): + name = 'login' + + def __init__(self, args): + self.args = args + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for login command. 
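The llamafile selection rule in LlamafileCMD.execute above boils down to: take an exact --file match if given, otherwise the first file whose name contains the accuracy tag, otherwise the first llamafile in the repo. A stand-alone sketch of that rule, with a hypothetical file listing:

    llamafiles = ['qwen.Q8_0.llamafile', 'qwen.Q4_K_M.llamafile']  # hypothetical listing
    requested_file = None   # --file
    accuracy = 'q4_k_m'     # --accuracy (the command's default)

    selected = llamafiles[0]
    for name in llamafiles:
        if requested_file and name == requested_file:
            selected = name
            break
        if accuracy and accuracy.lower() in name.lower():
            selected = name
            break
    print(selected)  # -> qwen.Q4_K_M.llamafile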
+ """ + parser = parsers.add_parser(LoginCMD.name) + parser.add_argument( + '--token', + type=str, + required=True, + help='The Access Token for modelscope.') + parser.set_defaults(func=subparser_func) + + def execute(self): + api = HubApi() + api.login(self.args.token) diff --git a/modelscope/cli/modelcard.py b/modelscope/cli/modelcard.py index 5e2b65803..646cf1b0f 100644 --- a/modelscope/cli/modelcard.py +++ b/modelscope/cli/modelcard.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging import os import shutil import tempfile @@ -11,7 +12,7 @@ from modelscope.hub.utils.utils import get_endpoint from modelscope.utils.logger import get_logger -logger = get_logger() +logger = get_logger(log_level=logging.WARNING) current_path = os.path.dirname(os.path.abspath(__file__)) template_path = os.path.join(current_path, 'template') @@ -29,7 +30,8 @@ class ModelCardCMD(CLICommand): def __init__(self, args): self.args = args self.api = HubApi() - self.api.login(args.access_token) + if args.access_token: + self.api.login(args.access_token) self.model_id = os.path.join( self.args.group_id, self.args.model_id ) if '/' not in self.args.model_id else self.args.model_id @@ -39,12 +41,12 @@ def __init__(self, args): def define_args(parsers: ArgumentParser): """ define args for create or upload modelcard command. """ - parser = parsers.add_parser(ModelCardCMD.name) + parser = parsers.add_parser(ModelCardCMD.name, aliases=['model']) parser.add_argument( '-tk', '--access_token', type=str, - required=True, + required=False, help='the certification of visit ModelScope') parser.add_argument( '-act', @@ -70,13 +72,15 @@ def define_args(parsers: ArgumentParser): '--visibility', type=int, default=5, - help='the visibility of ModelScope') + help= + 'the visibility of ModelScope[PRIVATE: 1, INTERNAL:3, PUBLIC:5]') parser.add_argument( '-lic', '--license', type=str, default='Apache License 2.0', - help='the license of visit ModelScope') + help='the license of visit ModelScope[Apache License 2.0|' + 'GPL-2.0|GPL-3.0|LGPL-2.1|LGPL-3.0|AFL-3.0|ECL-2.0|MIT]') parser.add_argument( '-ch', '--chinese_name', diff --git a/modelscope/cli/pipeline.py b/modelscope/cli/pipeline.py index 793632e05..2b6f7951a 100644 --- a/modelscope/cli/pipeline.py +++ b/modelscope/cli/pipeline.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import logging import os from argparse import ArgumentParser from string import Template @@ -6,7 +7,7 @@ from modelscope.cli.base import CLICommand from modelscope.utils.logger import get_logger -logger = get_logger() +logger = get_logger(log_level=logging.WARNING) current_path = os.path.dirname(os.path.abspath(__file__)) template_path = os.path.join(current_path, 'template') diff --git a/modelscope/cli/server.py b/modelscope/cli/server.py new file mode 100644 index 000000000..17d6ca4d0 --- /dev/null +++ b/modelscope/cli/server.py @@ -0,0 +1,38 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import logging +import os +from argparse import ArgumentParser +from string import Template + +from modelscope.cli.base import CLICommand +from modelscope.server.api_server import add_server_args, run_server +from modelscope.utils.logger import get_logger + +logger = get_logger(log_level=logging.WARNING) + +current_path = os.path.dirname(os.path.abspath(__file__)) +template_path = os.path.join(current_path, 'template') + + +def subparser_func(args): + """ Function which will be called for a specific sub parser. 
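The login subcommand introduced in this diff is a thin wrapper around the hub API; the programmatic equivalent is a single call (the token string below is a placeholder for a real SDK token):

    from modelscope.hub.api import HubApi

    api = HubApi()
    api.login('YOUR_SDK_TOKEN')  # same call LoginCMD.execute makes with --token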
+ """ + return ServerCMD(args) + + +class ServerCMD(CLICommand): + name = 'server' + + def __init__(self, args): + self.args = args + + @staticmethod + def define_args(parsers: ArgumentParser): + """ define args for create pipeline template command. + """ + parser = parsers.add_parser(ServerCMD.name) + add_server_args(parser) + parser.set_defaults(func=subparser_func) + + def execute(self): + run_server(self.args) diff --git a/modelscope/exporters/__init__.py b/modelscope/exporters/__init__.py index 7fc094ac7..871ed3366 100644 --- a/modelscope/exporters/__init__.py +++ b/modelscope/exporters/__init__.py @@ -8,7 +8,7 @@ from .base import Exporter from .builder import build_exporter from .cv import CartoonTranslationExporter, FaceDetectionSCRFDExporter - from .multi_modal import StableDiffuisonExporter + from .multi_modal import StableDiffusionExporter from .nlp import (CsanmtForTranslationExporter, SbertForSequenceClassificationExporter, SbertForZeroShotClassificationExporter) @@ -19,7 +19,7 @@ 'base': ['Exporter'], 'builder': ['build_exporter'], 'cv': ['CartoonTranslationExporter', 'FaceDetectionSCRFDExporter'], - 'multi_modal': ['StableDiffuisonExporter'], + 'multi_modal': ['StableDiffusionExporter'], 'nlp': [ 'CsanmtForTranslationExporter', 'SbertForSequenceClassificationExporter', diff --git a/modelscope/exporters/multi_modal/__init__.py b/modelscope/exporters/multi_modal/__init__.py index ab565d1ca..f19b04f1c 100644 --- a/modelscope/exporters/multi_modal/__init__.py +++ b/modelscope/exporters/multi_modal/__init__.py @@ -5,10 +5,10 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .stable_diffusion_export import StableDiffuisonExporter + from .stable_diffusion_export import StableDiffusionExporter else: _import_structure = { - 'stable_diffusion_export': ['StableDiffuisonExporter'], + 'stable_diffusion_export': ['StableDiffusionExporter'], } import sys diff --git a/modelscope/exporters/multi_modal/stable_diffusion_exporter.py b/modelscope/exporters/multi_modal/stable_diffusion_exporter.py index 62ab0ce54..2c4319867 100644 --- a/modelscope/exporters/multi_modal/stable_diffusion_exporter.py +++ b/modelscope/exporters/multi_modal/stable_diffusion_exporter.py @@ -23,7 +23,7 @@ @EXPORTERS.register_module( Tasks.text_to_image_synthesis, module_name=Models.stable_diffusion) -class StableDiffuisonExporter(TorchModelExporter): +class StableDiffusionExporter(TorchModelExporter): @torch.no_grad() def export_onnx(self, diff --git a/modelscope/fileio/__init__.py b/modelscope/fileio/__init__.py index 385cd02c5..986affb27 100644 --- a/modelscope/fileio/__init__.py +++ b/modelscope/fileio/__init__.py @@ -1,4 +1,22 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from .file import File, LocalStorage -from .io import dump, dumps, load +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .file import File, LocalStorage + from .io import dump, dumps, load +else: + _import_structure = { + 'io': ['dump', 'dumps', 'load'], + 'file': ['File', 'LocalStorage'] + } + import sys + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/fileio/format/json.py b/modelscope/fileio/format/json.py index 660e342a2..cb26e58d6 100644 --- a/modelscope/fileio/format/json.py +++ b/modelscope/fileio/format/json.py @@ -1,11 +1,9 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-import numpy as np - -from . import jsonplus from .base import FormatHandler def set_default(obj): + import numpy as np """Set default json values for non-serializable values. It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. @@ -25,10 +23,13 @@ class JsonHandler(FormatHandler): """Use jsonplus, serialization of Python types to JSON that "just works".""" def load(self, file): + from . import jsonplus return jsonplus.loads(file.read()) def dump(self, obj, file, **kwargs): + from . import jsonplus file.write(self.dumps(obj, **kwargs)) def dumps(self, obj, **kwargs): + from . import jsonplus return jsonplus.dumps(obj, **kwargs) diff --git a/modelscope/fileio/format/jsonplus.py b/modelscope/fileio/format/jsonplus.py index af59caeb0..48a4d512a 100644 --- a/modelscope/fileio/format/jsonplus.py +++ b/modelscope/fileio/format/jsonplus.py @@ -9,7 +9,7 @@ import threading import uuid from collections import namedtuple -from datetime import date, datetime, time, timedelta +from datetime import timedelta from dateutil.parser import parse as parse_datetime from decimal import Decimal from fractions import Fraction diff --git a/modelscope/hub/api.py b/modelscope/hub/api.py index f83defd0e..a0d977125 100644 --- a/modelscope/hub/api.py +++ b/modelscope/hub/api.py @@ -15,13 +15,15 @@ from http.cookiejar import CookieJar from os.path import expanduser from typing import Dict, List, Optional, Tuple, Union +from urllib.parse import urlencode -import pandas as pd +import json import requests from requests import Session from requests.adapters import HTTPAdapter, Retry -from modelscope.hub.constants import (API_HTTP_CLIENT_TIMEOUT, +from modelscope.hub.constants import (API_HTTP_CLIENT_MAX_RETRIES, + API_HTTP_CLIENT_TIMEOUT, API_RESPONSE_FIELD_DATA, API_RESPONSE_FIELD_EMAIL, API_RESPONSE_FIELD_GIT_ACCESS_TOKEN, @@ -31,7 +33,8 @@ MODELSCOPE_CLOUD_ENVIRONMENT, MODELSCOPE_CLOUD_USERNAME, MODELSCOPE_REQUEST_ID, ONE_YEAR_SECONDS, - REQUESTS_API_HTTP_METHOD, Licenses, + REQUESTS_API_HTTP_METHOD, + DatasetVisibility, Licenses, ModelVisibility) from modelscope.hub.errors import (InvalidParameter, NotExistError, NotLoginException, NoValidRevisionError, @@ -45,13 +48,14 @@ DEFAULT_MODEL_REVISION, DEFAULT_REPOSITORY_REVISION, MASTER_MODEL_BRANCH, META_FILES_FORMAT, + REPO_TYPE_MODEL, ConfigFields, DatasetFormations, DatasetMetaFormats, DatasetVisibilityMap, DownloadChannel, - DownloadMode, ModelFile, - VirgoDatasetConfig) + DownloadMode, Frameworks, ModelFile, + Tasks, VirgoDatasetConfig) from modelscope.utils.logger import get_logger -from .utils.utils import (get_endpoint, get_release_datetime, - model_id_to_group_owner_name) +from .utils.utils import (get_endpoint, get_readable_folder_size, + get_release_datetime, model_id_to_group_owner_name) logger = get_logger() @@ -59,7 +63,10 @@ class HubApi: """Model hub api interface. 
""" - def __init__(self, endpoint: Optional[str] = None): + def __init__(self, + endpoint: Optional[str] = None, + timeout=API_HTTP_CLIENT_TIMEOUT, + max_retries=API_HTTP_CLIENT_MAX_RETRIES): """The ModelScope HubApi。 Args: @@ -69,11 +76,12 @@ def __init__(self, endpoint: Optional[str] = None): self.headers = {'user-agent': ModelScopeConfig.get_user_agent()} self.session = Session() retry = Retry( - total=2, + total=max_retries, read=2, connect=2, backoff_factor=1, status_forcelist=(500, 502, 503, 504), + respect_retry_after_header=False, ) adapter = HTTPAdapter(max_retries=retry) self.session.mount('http://', adapter) @@ -84,12 +92,12 @@ def __init__(self, endpoint: Optional[str] = None): self.session, method, functools.partial( getattr(self.session, method), - timeout=API_HTTP_CLIENT_TIMEOUT)) + timeout=timeout)) def login( self, access_token: str, - ) -> tuple(): + ): """Login with your SDK access token, which can be obtained from https://www.modelscope.cn user center. @@ -241,6 +249,58 @@ def get_model( else: raise_for_http_status(r) + def repo_exists( + self, + repo_id: str, + *, + repo_type: Optional[str] = None, + ) -> bool: + """ + Checks if a repository exists on ModelScope + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + repo_type (`str`, *optional*): + `None` or `"model"` if getting repository info from a model. Default is `None`. + TODO: support dataset and studio + + Returns: + True if the repository exists, False otherwise. + """ + if (repo_type is not None) and repo_type.lower != REPO_TYPE_MODEL: + raise Exception('Not support repo-type: %s' % repo_type) + if (repo_id is None) or repo_id.count('/') != 1: + raise Exception('Invalid repo_id: %s, must be of format namespace/name' % repo_type) + + cookies = ModelScopeConfig.get_cookies() + owner_or_group, name = model_id_to_group_owner_name(repo_id) + path = f'{self.endpoint}/api/v1/models/{owner_or_group}/{name}' + + r = self.session.get(path, cookies=cookies, + headers=self.builder_headers(self.headers)) + code = handle_http_response(r, logger, cookies, repo_id, False) + if code == 200: + return True + elif code == 404: + return False + else: + logger.warn(f'Check repo_exists return status code {code}.') + raise Exception( + 'Failed to check existence of repo: %s, make sure you have access authorization.' + % repo_type) + + @staticmethod + def _create_default_config(model_dir): + cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) + cfg = { + ConfigFields.framework: Frameworks.torch, + ConfigFields.task: Tasks.other, + } + with open(cfg_file, 'w') as file: + json.dump(cfg, file) + def push_model(self, model_id: str, model_dir: str, @@ -264,6 +324,8 @@ def push_model(self, This function must be called before calling HubApi's login with a valid token which can be obtained from ModelScope's website. + If any error, please upload via git commands. + Args: model_id (str): The model id to be uploaded, caller must have write permission for it. 
@@ -306,23 +368,23 @@ def push_model(self, raise InvalidParameter('model_dir must be a valid directory.') cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION) if not os.path.exists(cfg_file): - raise ValueError(f'{model_dir} must contain a configuration.json.') + logger.warning( + f'No {ModelFile.CONFIGURATION} file found in {model_dir}, creating a default one.') + HubApi._create_default_config(model_dir) + cookies = ModelScopeConfig.get_cookies() if cookies is None: raise NotLoginException('Must login before upload!') files_to_save = os.listdir(model_dir) + folder_size = get_readable_folder_size(model_dir) if ignore_file_pattern is None: ignore_file_pattern = [] if isinstance(ignore_file_pattern, str): ignore_file_pattern = [ignore_file_pattern] - try: - self.get_model(model_id=model_id) - except Exception: - if visibility is None or license is None: - raise InvalidParameter( - 'visibility and license cannot be empty if want to create new repo' - ) - logger.info('Create new model %s' % model_id) + if visibility is None or license is None: + raise InvalidParameter('Visibility and License cannot be empty for new model.') + if not self.repo_exists(model_id): + logger.info('Creating new model [%s]' % model_id) self.create_model( model_id=model_id, visibility=visibility, @@ -331,11 +393,13 @@ def push_model(self, original_model_id=original_model_id) tmp_dir = tempfile.mkdtemp() git_wrapper = GitCommandWrapper() + logger.info(f'Pushing folder {model_dir} as model {model_id}.') + logger.info(f'Total folder size {folder_size}, this may take a while depending on actual pushing size...') try: repo = Repository(model_dir=tmp_dir, clone_from=model_id) branches = git_wrapper.get_remote_branches(tmp_dir) if revision not in branches: - logger.info('Create new branch %s' % revision) + logger.info('Creating new branch %s' % revision) git_wrapper.new_branch(tmp_dir, revision) git_wrapper.checkout(tmp_dir, revision) files_in_repo = os.listdir(tmp_dir) @@ -399,7 +463,7 @@ def list_models(self, (owner_or_group, page_number, page_size), cookies=cookies, headers=self.builder_headers(self.headers)) - handle_http_response(r, logger, cookies, 'list_model') + handle_http_response(r, logger, cookies, owner_or_group) if r.status_code == HTTPStatus.OK: if is_ok(r.json()): data = r.json()[API_RESPONSE_FIELD_DATA] @@ -429,6 +493,30 @@ def list_model_revisions( use_cookies: Union[bool, CookieJar] = False) -> List[str]: """Get model branch and tags. + Args: + model_id (str): The model id + cutoff_timestamp (int): Tags created before the cutoff will be included. + The timestamp is represented by the seconds elapsed from the epoch time. + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, + will load cookie from local. Defaults to False. + + Returns: + Tuple[List[str], List[str]]: Return list of branch name and tags + """ + tags_details = self.list_model_revisions_detail(model_id=model_id, + cutoff_timestamp=cutoff_timestamp, + use_cookies=use_cookies) + tags = [x['Revision'] for x in tags_details + ] if tags_details else [] + return tags + + def list_model_revisions_detail( + self, + model_id: str, + cutoff_timestamp: Optional[int] = None, + use_cookies: Union[bool, CookieJar] = False) -> List[str]: + """Get model branch and tags. + Args: model_id (str): The model id cutoff_timestamp (int): Tags created before the cutoff will be included. 
@@ -450,65 +538,89 @@ def list_model_revisions( raise_on_error(d) info = d[API_RESPONSE_FIELD_DATA] # tags returned from backend are guaranteed to be ordered by create-time - tags = [x['Revision'] for x in info['RevisionMap']['Tags'] - ] if info['RevisionMap']['Tags'] else [] - return tags + return info['RevisionMap']['Tags'] - def get_valid_revision(self, - model_id: str, - revision=None, - cookies: Optional[CookieJar] = None): + def get_branch_tag_detail(self, details, name): + for item in details: + if item['Revision'] == name: + return item + return None + + def get_valid_revision_detail(self, + model_id: str, + revision=None, + cookies: Optional[CookieJar] = None): release_timestamp = get_release_datetime() current_timestamp = int(round(datetime.datetime.now().timestamp())) # for active development in library codes (non-release-branches), release_timestamp # is set to be a far-away-time-in-the-future, to ensure that we shall # get the master-HEAD version from model repo by default (when no revision is provided) + all_branches_detail, all_tags_detail = self.get_model_branches_and_tags_details( + model_id, use_cookies=False if cookies is None else cookies) + all_branches = [x['Revision'] for x in all_branches_detail] if all_branches_detail else [] + all_tags = [x['Revision'] for x in all_tags_detail] if all_tags_detail else [] if release_timestamp > current_timestamp + ONE_YEAR_SECONDS: - branches, tags = self.get_model_branches_and_tags( - model_id, use_cookies=False if cookies is None else cookies) if revision is None: revision = MASTER_MODEL_BRANCH logger.info( 'Model revision not specified, use default: %s in development mode' % revision) - if revision not in branches and revision not in tags: + if revision not in all_branches and revision not in all_tags: raise NotExistError('The model: %s has no revision : %s .' % (model_id, revision)) - logger.info('Development mode use revision: %s' % revision) + + revision_detail = self.get_branch_tag_detail(all_tags_detail, revision) + if revision_detail is None: + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + logger.debug('Development mode use revision: %s' % revision) else: - all_revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=current_timestamp, - use_cookies=False if cookies is None else cookies) - if len(all_revisions) == 0: + if revision is not None and revision in all_branches: + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + logger.warning('Using branch: %s as version is unstable, use with caution' % revision) + return revision_detail + + if len(all_tags_detail) == 0: # use no revision use master as default. if revision is None or revision == MASTER_MODEL_BRANCH: revision = MASTER_MODEL_BRANCH else: raise NotExistError('The model: %s has no revision: %s !' % (model_id, revision)) + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) else: if revision is None: # user not specified revision, use latest revision before release time - revisions = self.list_model_revisions( - model_id, - cutoff_timestamp=release_timestamp, - use_cookies=False if cookies is None else cookies) - if len(revisions) > 0: - revision = revisions[0] # use latest revision before release time. + revisions_detail = [x for x in + all_tags_detail if x['CreatedAt'] <= release_timestamp] if all_tags_detail else [] # noqa E501 + if len(revisions_detail) > 0: + revision = revisions_detail[0]['Revision'] # use latest revision before release time. 
+ revision_detail = revisions_detail[0] else: - vl = '[%s]' % ','.join(all_revisions) - raise NoValidRevisionError('Model revision should be specified from revisions: %s' % (vl)) + revision = MASTER_MODEL_BRANCH + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) + vl = '[%s]' % ','.join(all_tags) + logger.warning('Model revision should be specified from revisions: %s' % (vl)) logger.warning('Model revision not specified, use revision: %s' % revision) else: # use user-specified revision - if revision not in all_revisions: + if revision not in all_tags: if revision == MASTER_MODEL_BRANCH: logger.warning('Using the master branch is fragile, please use it with caution!') + revision_detail = self.get_branch_tag_detail(all_branches_detail, revision) else: - vl = '[%s]' % ','.join(all_revisions) + vl = '[%s]' % ','.join(all_tags) raise NotExistError('The model: %s has no revision: %s valid are: %s!' % (model_id, revision, vl)) + else: + revision_detail = self.get_branch_tag_detail(all_tags_detail, revision) logger.info('Use user-specified model revision: %s' % revision) - return revision + return revision_detail - def get_model_branches_and_tags( + def get_valid_revision(self, + model_id: str, + revision=None, + cookies: Optional[CookieJar] = None): + return self.get_valid_revision_detail(model_id=model_id, + revision=revision, + cookies=cookies)['Revision'] + + def get_model_branches_and_tags_details( self, model_id: str, use_cookies: Union[bool, CookieJar] = False, @@ -532,10 +644,29 @@ def get_model_branches_and_tags( d = r.json() raise_on_error(d) info = d[API_RESPONSE_FIELD_DATA] - branches = [x['Revision'] for x in info['RevisionMap']['Branches'] - ] if info['RevisionMap']['Branches'] else [] - tags = [x['Revision'] for x in info['RevisionMap']['Tags'] - ] if info['RevisionMap']['Tags'] else [] + return info['RevisionMap']['Branches'], info['RevisionMap']['Tags'] + + def get_model_branches_and_tags( + self, + model_id: str, + use_cookies: Union[bool, CookieJar] = False, + ) -> Tuple[List[str], List[str]]: + """Get model branch and tags. + + Args: + model_id (str): The model id + use_cookies (Union[bool, CookieJar], optional): If is cookieJar, we will use this cookie, if True, + will load cookie from local. Defaults to False. 
+ + Returns: + Tuple[List[str], List[str]]: Return list of branch name and tags + """ + branches_detail, tags_detail = self.get_model_branches_and_tags_details(model_id=model_id, + use_cookies=use_cookies) + branches = [x['Revision'] for x in branches_detail + ] if branches_detail else [] + tags = [x['Revision'] for x in tags_detail + ] if tags_detail else [] return branches, tags def get_model_files(self, @@ -585,6 +716,64 @@ def get_model_files(self, files.append(file) return files + def file_exists( + self, + repo_id: str, + filename: str, + *, + revision: Optional[str] = None, + ): + """Get if the specified file exists + + Args: + repo_id (`str`): The repo id to use + filename (`str`): The queried filename + revision (`Optional[str]`): The repo revision + Returns: + The query result in bool value + """ + files = self.get_model_files(repo_id, revision=revision) + files = [file['Name'] for file in files] + return filename in files + + def create_dataset(self, + dataset_name: str, + namespace: str, + chinese_name: Optional[str] = '', + license: Optional[str] = Licenses.APACHE_V2, + visibility: Optional[int] = DatasetVisibility.PUBLIC, + description: Optional[str] = '') -> str: + + if dataset_name is None or namespace is None: + raise InvalidParameter('dataset_name and namespace are required!') + + cookies = ModelScopeConfig.get_cookies() + if cookies is None: + raise ValueError('Token does not exist, please login first.') + + path = f'{self.endpoint}/api/v1/datasets' + files = { + 'Name': (None, dataset_name), + 'ChineseName': (None, chinese_name), + 'Owner': (None, namespace), + 'License': (None, license), + 'Visibility': (None, visibility), + 'Description': (None, description) + } + + r = self.session.post( + path, + files=files, + cookies=cookies, + headers=self.builder_headers(self.headers), + ) + + handle_http_post_error(r, path, files) + raise_on_error(r.json()) + dataset_repo_url = f'{self.endpoint}/datasets/{namespace}/{dataset_name}' + logger.info(f'Create dataset success: {dataset_repo_url}') + return dataset_repo_url + def list_datasets(self): path = f'{self.endpoint}/api/v1/datasets' params = {} @@ -600,11 +789,56 @@ def get_dataset_id_and_type(self, dataset_name: str, namespace: str): cookies = ModelScopeConfig.get_cookies() r = self.session.get(datahub_url, cookies=cookies) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + datahub_raise_on_error(datahub_url, resp, r) dataset_id = resp['Data']['Id'] dataset_type = resp['Data']['Type'] return dataset_id, dataset_type + def get_dataset_infos(self, + dataset_hub_id: str, + revision: str, + files_metadata: bool = False, + timeout: float = 100, + recursive: str = 'True'): + """ + Get dataset infos. 
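Two quick sketches for the helpers added above: file_exists checks the file list of a model repo, and create_dataset creates a new dataset repo (a prior login is required). The repo, file and dataset names are placeholders; the license and visibility constants come from this diff:

    from modelscope.hub.api import HubApi
    from modelscope.hub.constants import DatasetVisibility, Licenses

    api = HubApi()

    has_config = api.file_exists('namespace/model-name', 'configuration.json')

    repo_url = api.create_dataset(
        dataset_name='my-dataset',
        namespace='my-namespace',
        license=Licenses.APACHE_V2,
        visibility=DatasetVisibility.PUBLIC,
    )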
+ """ + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree' + params = {'Revision': revision, 'Root': None, 'Recursive': recursive} + cookies = ModelScopeConfig.get_cookies() + if files_metadata: + params['blobs'] = True + r = self.session.get(datahub_url, params=params, cookies=cookies, timeout=timeout) + resp = r.json() + datahub_raise_on_error(datahub_url, resp, r) + + return resp + + def list_repo_tree(self, + dataset_name: str, + namespace: str, + revision: str, + root_path: str, + recursive: bool = True, + page_number: int = 1, + page_size: int = 100): + + dataset_hub_id, dataset_type = self.get_dataset_id_and_type( + dataset_name=dataset_name, namespace=namespace) + + recursive = 'True' if recursive else 'False' + datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_hub_id}/repo/tree' + params = {'Revision': revision if revision else 'master', + 'Root': root_path if root_path else '/', 'Recursive': recursive, + 'PageNumber': page_number, 'PageSize': page_size} + cookies = ModelScopeConfig.get_cookies() + + r = self.session.get(datahub_url, params=params, cookies=cookies) + resp = r.json() + datahub_raise_on_error(datahub_url, resp, r) + + return resp + def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_id: str, revision: str): """ Get the meta file-list of the dataset. """ datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}' @@ -613,7 +847,7 @@ def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_ cookies=cookies, headers=self.builder_headers(self.headers)) resp = r.json() - datahub_raise_on_error(datahub_url, resp) + datahub_raise_on_error(datahub_url, resp, r) file_list = resp['Data'] if file_list is None: raise NotExistError( @@ -626,8 +860,10 @@ def get_dataset_meta_file_list(self, dataset_name: str, namespace: str, dataset_ @staticmethod def dump_datatype_file(dataset_type: int, meta_cache_dir: str): """ - Dump the data_type as a local file, in order to get the dataset formation without calling the datahub. - More details, please refer to the class `modelscope.utils.constant.DatasetFormations`. + Dump the data_type as a local file, in order to get the dataset + formation without calling the datahub. + More details, please refer to the class + `modelscope.utils.constant.DatasetFormations`. """ dataset_type_file_path = os.path.join(meta_cache_dir, f'{str(dataset_type)}{DatasetFormations.formation_mark_ext.value}') @@ -673,8 +909,9 @@ def fetch_meta_files_from_url(url, out_path, chunk_size=1024, mode=DownloadMode. Fetch the meta-data files from the url, e.g. csv/jsonl files. 
""" import hashlib - import json - from tqdm import tqdm + from tqdm.auto import tqdm + import pandas as pd + out_path = os.path.join(out_path, hashlib.md5(url.encode(encoding='UTF-8')).hexdigest()) if mode == DownloadMode.FORCE_REDOWNLOAD and os.path.exists(out_path): os.remove(out_path) @@ -712,7 +949,7 @@ def get_chunk(resp): else: with_header = False chunk_df = pd.DataFrame(chunk) - chunk_df.to_csv(f, index=False, header=with_header) + chunk_df.to_csv(f, index=False, header=with_header, escapechar='\\') iter_num += 1 else: # csv or others @@ -723,6 +960,35 @@ def get_chunk(resp): return out_path def get_dataset_file_url( + self, + file_name: str, + dataset_name: str, + namespace: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + view: Optional[bool] = False, + extension_filter: Optional[bool] = True): + + if not file_name or not dataset_name or not namespace: + raise ValueError('Args (file_name, dataset_name, namespace) cannot be empty!') + + # Note: make sure the FilePath is the last parameter in the url + params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': file_name, 'View': view} + params: str = urlencode(params) + file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?{params}' + + return file_url + + # if extension_filter: + # if os.path.splitext(file_name)[-1] in META_FILES_FORMAT: + # file_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?'\ + # f'Revision={revision}&FilePath={file_name}' + # else: + # file_url = file_name + # return file_url + # else: + # return file_url + + def get_dataset_file_url_origin( self, file_name: str, dataset_name: str, @@ -866,10 +1132,10 @@ def datahub_remote_call(self, url): cookies=cookies, headers={'user-agent': ModelScopeConfig.get_user_agent()}) resp = r.json() - datahub_raise_on_error(url, resp) + datahub_raise_on_error(url, resp, r) return resp['Data'] - def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool) -> None: + def dataset_download_statistics(self, dataset_name: str, namespace: str, use_streaming: bool = False) -> None: is_ci_test = os.getenv('CI_TEST') == 'True' if dataset_name and namespace and not is_ci_test and not use_streaming: try: @@ -902,6 +1168,10 @@ def builder_headers(self, headers): return {MODELSCOPE_REQUEST_ID: str(uuid.uuid4().hex), **headers} + def get_file_base_path(self, namespace: str, dataset_name: str) -> str: + return f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' 
+ # return f'{endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?Revision={revision}&FilePath=' + class ModelScopeConfig: path_credential = expanduser(DEFAULT_CREDENTIALS_PATH) @@ -909,6 +1179,7 @@ class ModelScopeConfig: GIT_TOKEN_FILE_NAME = 'git_token' USER_INFO_FILE_NAME = 'user' USER_SESSION_ID_FILE_NAME = 'session' + cookie_expired_warning = False @staticmethod def make_sure_credential_path_exist(): @@ -930,10 +1201,12 @@ def get_cookies(): with open(cookies_path, 'rb') as f: cookies = pickle.load(f) for cookie in cookies: - if cookie.is_expired(): - logger.warning( + if cookie.is_expired() and not ModelScopeConfig.cookie_expired_warning: + ModelScopeConfig.cookie_expired_warning = True + logger.debug( 'Authentication has expired, ' - 'please re-login if you need to access private models or datasets.') + 'please re-login with modelscope login --token "YOUR_SDK_TOKEN" ' + 'if you need to access private models or datasets.') return None return cookies return None diff --git a/modelscope/hub/check_model.py b/modelscope/hub/check_model.py index ab1500dbd..59a77bfee 100644 --- a/modelscope/hub/check_model.py +++ b/modelscope/hub/check_model.py @@ -48,7 +48,7 @@ def check_local_model_is_latest( 'Snapshot': 'True' } } - _api = HubApi() + _api = HubApi(timeout=0.5) try: _, revisions = _api.get_model_branches_and_tags( model_id=model_id, use_cookies=cookies) diff --git a/modelscope/hub/constants.py b/modelscope/hub/constants.py index 3ebc167d7..b3d03e1ae 100644 --- a/modelscope/hub/constants.py +++ b/modelscope/hub/constants.py @@ -17,10 +17,11 @@ DEFAULT_CREDENTIALS_PATH = Path.home().joinpath('.modelscope', 'credentials') REQUESTS_API_HTTP_METHOD = ['get', 'head', 'post', 'put', 'patch', 'delete'] API_HTTP_CLIENT_TIMEOUT = 60 +API_HTTP_CLIENT_MAX_RETRIES = 2 API_RESPONSE_FIELD_DATA = 'Data' API_FILE_DOWNLOAD_RETRY_TIMES = 5 -API_FILE_DOWNLOAD_TIMEOUT = 30 -API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 16 +API_FILE_DOWNLOAD_TIMEOUT = 60 +API_FILE_DOWNLOAD_CHUNK_SIZE = 1024 * 1024 * 1 API_RESPONSE_FIELD_GIT_ACCESS_TOKEN = 'AccessToken' API_RESPONSE_FIELD_USERNAME = 'Username' API_RESPONSE_FIELD_EMAIL = 'Email' @@ -28,10 +29,10 @@ MODELSCOPE_CLOUD_ENVIRONMENT = 'MODELSCOPE_ENVIRONMENT' MODELSCOPE_CLOUD_USERNAME = 'MODELSCOPE_USERNAME' MODELSCOPE_SDK_DEBUG = 'MODELSCOPE_SDK_DEBUG' +MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION = 'MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION' ONE_YEAR_SECONDS = 24 * 365 * 60 * 60 -MODEL_META_FILE_NAME = '.mdl' -MODEL_META_MODEL_ID = 'id' MODELSCOPE_REQUEST_ID = 'X-Request-ID' +TEMPORARY_FOLDER_NAME = '._____temp' class Licenses(object): @@ -49,3 +50,9 @@ class ModelVisibility(object): PRIVATE = 1 INTERNAL = 3 PUBLIC = 5 + + +class DatasetVisibility(object): + PRIVATE = 1 + INTERNAL = 3 + PUBLIC = 5 diff --git a/modelscope/hub/errors.py b/modelscope/hub/errors.py index 48bb5fe0c..986425d2d 100644 --- a/modelscope/hub/errors.py +++ b/modelscope/hub/errors.py @@ -1,6 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
+import logging
 from http import HTTPStatus
+from typing import Optional

 import requests
 from requests.exceptions import HTTPError
@@ -8,7 +10,7 @@
 from modelscope.hub.constants import MODELSCOPE_REQUEST_ID
 from modelscope.utils.logger import get_logger

-logger = get_logger()
+logger = get_logger(log_level=logging.WARNING)


 class NotSupportError(Exception):
@@ -85,18 +87,41 @@ def handle_http_post_error(response, url, request_body):
             (url, request_body, message, get_request_id(response))) from error


-def handle_http_response(response: requests.Response, logger, cookies,
-                         model_id):
-    try:
-        response.raise_for_status()
-    except HTTPError as error:
-        if cookies is None:  # code in [403] and
-            logger.error(
-                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
-                private. Please login first.')
-        message = _decode_response_error(response)
-        raise HTTPError('Response details: %s, Request id: %s' %
-                        (message, get_request_id(response))) from error
+def handle_http_response(response: requests.Response,
+                         logger,
+                         cookies,
+                         model_id,
+                         raise_on_error: Optional[bool] = True) -> int:
+    http_error_msg = ''
+    if isinstance(response.reason, bytes):
+        try:
+            reason = response.reason.decode('utf-8')
+        except UnicodeDecodeError:
+            reason = response.reason.decode('iso-8859-1')
+    else:
+        reason = response.reason
+    request_id = get_request_id(response)
+    if 404 == response.status_code:
+        http_error_msg = 'The request model: %s does not exist!' % (model_id)
+    elif 403 == response.status_code:
+        if cookies is None:
+            http_error_msg = 'Authentication token does not exist, ' \
+                'failed to access model %s which may not exist or may be ' \
+                'private. Please login first.' % model_id
+        else:
+            http_error_msg = 'The authentication token is invalid, failed to access model %s.' % model_id
+    elif 400 <= response.status_code < 500:
+        http_error_msg = u'%s Client Error: %s, Request id: %s for url: %s' % (
+            response.status_code, reason, request_id, response.url)
+
+    elif 500 <= response.status_code < 600:
+        http_error_msg = u'%s Server Error: %s, Request id: %s, for url: %s' % (
+            response.status_code, reason, request_id, response.url)
+    if http_error_msg and raise_on_error:  # there is an error.
+        logger.error(http_error_msg)
+        raise HTTPError(http_error_msg, response=response)
+    else:
+        return response.status_code


 def raise_on_error(rsp):
@@ -117,12 +142,13 @@ def raise_on_error(rsp):
         raise RequestError(rsp['Message'])


-def datahub_raise_on_error(url, rsp):
+def datahub_raise_on_error(url, rsp, http_response: requests.Response):
     """If response error, raise exception

     Args:
         url (str): The request url
         rsp (HTTPResponse): The server response.
+        http_response: the origin http response.

     Raises:
         RequestError: the http request error.
@@ -133,9 +159,9 @@ def datahub_raise_on_error(url, rsp): if rsp.get('Code') == HTTPStatus.OK: return True else: - request_id = get_request_id(rsp) + request_id = rsp['RequestId'] raise RequestError( - f"Url = {url}, Request id={request_id} Message = {rsp.get('Message')},\ + f"Url = {url}, Request id={request_id} Code = {rsp['Code']} Message = {rsp['Message']},\ Please specify correct dataset_name and namespace.") @@ -159,7 +185,12 @@ def raise_for_http_status(rsp): else: reason = rsp.reason request_id = get_request_id(rsp) - if 400 <= rsp.status_code < 500: + if 404 == rsp.status_code: + http_error_msg = 'The request resource(model or dataset) does not exist!,' + 'url: %s, reason: %s' % (rsp.url, reason) + elif 403 == rsp.status_code: + http_error_msg = 'Authentication token does not exist or invalid.' + elif 400 <= rsp.status_code < 500: http_error_msg = u'%s Client Error: %s, Request id: %s for url: %s' % ( rsp.status_code, reason, request_id, rsp.url) diff --git a/modelscope/hub/file_download.py b/modelscope/hub/file_download.py index c37b716ad..40ac8a038 100644 --- a/modelscope/hub/file_download.py +++ b/modelscope/hub/file_download.py @@ -1,9 +1,10 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import copy +import io import os import tempfile -import threading +import urllib import uuid from concurrent.futures import ThreadPoolExecutor from functools import partial @@ -13,19 +14,24 @@ import requests from requests.adapters import Retry -from tqdm import tqdm +from tqdm.auto import tqdm from modelscope.hub.api import HubApi, ModelScopeConfig from modelscope.hub.constants import ( API_FILE_DOWNLOAD_CHUNK_SIZE, API_FILE_DOWNLOAD_RETRY_TIMES, API_FILE_DOWNLOAD_TIMEOUT, FILE_HASH, MODELSCOPE_DOWNLOAD_PARALLELS, - MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB) -from modelscope.utils.constant import DEFAULT_MODEL_REVISION + MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB, TEMPORARY_FOLDER_NAME) +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION, + REPO_TYPE_DATASET, REPO_TYPE_MODEL, + REPO_TYPE_SUPPORT) +from modelscope.utils.file_utils import (get_dataset_cache_root, + get_model_cache_root) from modelscope.utils.logger import get_logger -from .errors import FileDownloadError, NotExistError +from .errors import FileDownloadError, InvalidParameter, NotExistError from .utils.caching import ModelFileSystemCache -from .utils.utils import (file_integrity_validation, get_cache_dir, - get_endpoint, model_id_to_group_owner_name) +from .utils.utils import (file_integrity_validation, get_endpoint, + model_id_to_group_owner_name) logger = get_logger() @@ -38,6 +44,7 @@ def model_file_download( user_agent: Union[Dict, str, None] = None, local_files_only: Optional[bool] = False, cookies: Optional[CookieJar] = None, + local_dir: Optional[str] = None, ) -> Optional[str]: # pragma: no cover """Download from a given URL and cache it if it's not already present in the local cache. @@ -55,6 +62,7 @@ def model_file_download( local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the local cached file if it exists. if `False`, download the file anyway even it exists. cookies (CookieJar, optional): The cookie of download request. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. 
Returns: string: string of local file or if networking is off, last version of @@ -74,16 +82,97 @@ def model_file_download( - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) if some parameter value is invalid """ - if cache_dir is None: - cache_dir = get_cache_dir() - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - temporary_cache_dir = os.path.join(cache_dir, 'temp') - os.makedirs(temporary_cache_dir, exist_ok=True) + return _repo_file_download( + model_id, + file_path, + repo_type=REPO_TYPE_MODEL, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + local_dir=local_dir) - group_or_owner, name = model_id_to_group_owner_name(model_id) - cache = ModelFileSystemCache(cache_dir, group_or_owner, name) +def dataset_file_download( + dataset_id: str, + file_path: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + cache_dir: Union[str, Path, None] = None, + local_dir: Optional[str] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, +) -> str: + """Download raw files of a dataset. + Downloads all files at the specified revision. This + is useful when you want all files from a dataset, because you don't know which + ones you will need a priori. All files are nested inside a folder in order + to keep their actual filename relative to that folder. + + An alternative would be to just clone a dataset but this would require that the + user always has git and git-lfs installed, and properly configured. + + Args: + dataset_id (str): A user or an organization name and a dataset name separated by a `/`. + file_path (str): The relative path of the file to download. + revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (str, Path, optional): Path to the folder where cached files are stored, dataset file will + be save as cache_dir/dataset_id/THE_DATASET_FILES. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. + user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + cookies (CookieJar, optional): The cookie of the request, default None. + Raises: + ValueError: the value details. + + Returns: + str: Local folder path (string) of repo snapshot + + Note: + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. 
+ - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + """ + return _repo_file_download( + dataset_id, + file_path, + repo_type=REPO_TYPE_DATASET, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + local_dir=local_dir) + + +def _repo_file_download( + repo_id: str, + file_path: str, + *, + repo_type: str = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Optional[str] = None, + user_agent: Union[Dict, str, None] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + local_dir: Optional[str] = None, +) -> Optional[str]: # pragma: no cover + + if not repo_type: + repo_type = REPO_TYPE_MODEL + if repo_type not in REPO_TYPE_SUPPORT: + raise InvalidParameter('Invalid repo type: %s, only support: %s' % + (repo_type, REPO_TYPE_SUPPORT)) + + temporary_cache_dir, cache = create_temporary_directory_and_cache( + repo_id, local_dir=local_dir, cache_dir=cache_dir, repo_type=repo_type) # if local_files_only is `True` and the file already exists in cached_path # return the cached path @@ -97,7 +186,7 @@ def model_file_download( else: raise ValueError( 'Cannot find the requested files in the cached path and outgoing' - ' traffic has been disabled. To enable model look-ups and downloads' + ' traffic has been disabled. To enable look-ups and downloads' " online, set 'local_files_only' to False.") _api = HubApi() @@ -106,64 +195,115 @@ def model_file_download( } if cookies is None: cookies = ModelScopeConfig.get_cookies() + repo_files = [] + file_to_download_meta = None + if repo_type == REPO_TYPE_MODEL: + revision = _api.get_valid_revision( + repo_id, revision=revision, cookies=cookies) + # we need to confirm the version is up-to-date + # we need to get the file list to check if the latest version is cached, if so return, otherwise download + repo_files = _api.get_model_files( + model_id=repo_id, + revision=revision, + recursive=True, + use_cookies=False if cookies is None else cookies) + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + + if repo_file['Path'] == file_path: + if cache.exists(repo_file): + logger.debug( + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' + ) + return cache.get_file_by_info(repo_file) + else: + file_to_download_meta = repo_file + break + elif repo_type == REPO_TYPE_DATASET: + group_or_owner, name = model_id_to_group_owner_name(repo_id) + if not revision: + revision = DEFAULT_DATASET_REVISION + page_number = 1 + page_size = 100 + while True: + files_list_tree = _api.list_repo_tree( + dataset_name=name, + namespace=group_or_owner, + revision=revision, + root_path='/', + recursive=True, + page_number=page_number, + page_size=page_size) + if not ('Code' in files_list_tree + and files_list_tree['Code'] == 200): + print( + 'Get dataset: %s file list failed, request_id: %s, message: %s' + % (repo_id, files_list_tree['RequestId'], + files_list_tree['Message'])) + return None + repo_files = files_list_tree['Data']['Files'] + is_exist = False + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + + if repo_file['Path'] == file_path: + if cache.exists(repo_file): + logger.debug( + f'File {repo_file["Name"]} already in cache with identical hash, skip downloading!' 
+ ) + return cache.get_file_by_info(repo_file) + else: + file_to_download_meta = repo_file + is_exist = True + break + if len(repo_files) < page_size or is_exist: + break + page_number += 1 - revision = _api.get_valid_revision( - model_id, revision=revision, cookies=cookies) - file_to_download_info = None - # we need to confirm the version is up-to-date - # we need to get the file list to check if the latest version is cached, if so return, otherwise download - model_files = _api.get_model_files( - model_id=model_id, - revision=revision, - recursive=True, - use_cookies=False if cookies is None else cookies) - - for model_file in model_files: - if model_file['Type'] == 'tree': - continue - - if model_file['Path'] == file_path: - if cache.exists(model_file): - logger.debug( - f'File {model_file["Name"]} already in cache, skip downloading!' - ) - return cache.get_file_by_info(model_file) - else: - file_to_download_info = model_file - break - - if file_to_download_info is None: + if file_to_download_meta is None: raise NotExistError('The file path: %s not exist in: %s' % - (file_path, model_id)) + (file_path, repo_id)) # we need to download again - url_to_download = get_file_download_url(model_id, file_path, revision) - temp_file_name = next(tempfile._get_candidate_names()) + if repo_type == REPO_TYPE_MODEL: + url_to_download = get_file_download_url(repo_id, file_path, revision) + elif repo_type == REPO_TYPE_DATASET: + url_to_download = _api.get_dataset_file_url( + file_name=file_to_download_meta['Path'], + dataset_name=name, + namespace=group_or_owner, + revision=revision) + return download_file(url_to_download, file_to_download_meta, + temporary_cache_dir, cache, headers, cookies) + + +def create_temporary_directory_and_cache(model_id: str, + local_dir: str = None, + cache_dir: str = None, + repo_type: str = REPO_TYPE_MODEL): + if repo_type == REPO_TYPE_MODEL: + default_cache_root = get_model_cache_root() + elif repo_type == REPO_TYPE_DATASET: + default_cache_root = get_dataset_cache_root() - if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < file_to_download_info[ - 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: - parallel_download( - url_to_download, - temporary_cache_dir, - temp_file_name, - headers=headers, - cookies=None if cookies is None else cookies.get_dict(), - file_size=file_to_download_info['Size']) + group_or_owner, name = model_id_to_group_owner_name(model_id) + if local_dir is not None: + temporary_cache_dir = os.path.join(local_dir, TEMPORARY_FOLDER_NAME) + cache = ModelFileSystemCache(local_dir) else: - http_get_file( - url_to_download, - temporary_cache_dir, - temp_file_name, - headers=headers, - cookies=None if cookies is None else cookies.get_dict()) + if cache_dir is None: + cache_dir = default_cache_root + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + temporary_cache_dir = os.path.join(cache_dir, TEMPORARY_FOLDER_NAME, + group_or_owner, name) + name = name.replace('.', '___') + cache = ModelFileSystemCache(cache_dir, group_or_owner, name) - temp_file_path = os.path.join(temporary_cache_dir, temp_file_name) - # for download with commit we can't get Sha256 - if file_to_download_info[FILE_HASH] is not None: - file_integrity_validation(temp_file_path, - file_to_download_info[FILE_HASH]) - return cache.put_file(file_to_download_info, - os.path.join(temporary_cache_dir, temp_file_name)) + os.makedirs(temporary_cache_dir, exist_ok=True) + return temporary_cache_dir, cache def get_file_download_url(model_id: str, file_path: str, revision: str): @@ 
-179,6 +319,8 @@ def get_file_download_url(model_id: str, file_path: str, revision: str): Returns: str: The file url. """ + file_path = urllib.parse.quote_plus(file_path) + revision = urllib.parse.quote_plus(revision) download_url_template = '{endpoint}/api/v1/models/{model_id}/repo?Revision={revision}&FilePath={file_path}' return download_url_template.format( endpoint=get_endpoint(), @@ -190,18 +332,27 @@ def get_file_download_url(model_id: str, file_path: str, revision: str): def download_part_with_retry(params): # unpack parameters - progress, start, end, url, file_name, cookies, headers = params + model_file_path, progress, start, end, url, file_name, cookies, headers = params get_headers = {} if headers is None else copy.deepcopy(headers) - get_headers['Range'] = 'bytes=%s-%s' % (start, end) get_headers['X-Request-ID'] = str(uuid.uuid4().hex) retry = Retry( total=API_FILE_DOWNLOAD_RETRY_TIMES, backoff_factor=1, allowed_methods=['GET']) + part_file_name = model_file_path + '_%s_%s' % (start, end) while True: try: - with open(file_name, 'rb+') as f: - f.seek(start) + partial_length = 0 + if os.path.exists( + part_file_name): # download partial, continue download + with open(part_file_name, 'rb') as f: + partial_length = f.seek(0, io.SEEK_END) + progress.update(partial_length) + download_start = start + partial_length + if download_start > end: + break # this part is download completed. + get_headers['Range'] = 'bytes=%s-%s' % (download_start, end) + with open(part_file_name, 'ab+') as f: r = requests.get( url, stream=True, @@ -212,12 +363,12 @@ def download_part_with_retry(params): chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): if chunk: # filter out keep-alive new chunks f.write(chunk) - progress.update(end - start) + progress.update(len(chunk)) break except (Exception) as e: # no matter what exception, we will retry. retry = retry.increment('GET', url, error=e) - logger.warning('Download file from: %s to: %s failed, will retry' % - (start, end)) + logger.warning('Downloading: %s failed, reason: %s will retry' % + (model_file_path, e)) retry.sleep() @@ -230,36 +381,130 @@ def parallel_download( file_size: int = None, ): # create temp file - temp_file_manager = partial( - tempfile.NamedTemporaryFile, mode='wb', dir=local_dir, delete=False) - with temp_file_manager() as temp_file: - progress = tqdm( + with tqdm( unit='B', unit_scale=True, unit_divisor=1024, total=file_size, initial=0, - desc='Downloading', - ) + desc='Downloading [' + file_name + ']', + leave=True, + ) as progress: PART_SIZE = 160 * 1024 * 1024 # every part is 160M tasks = [] + file_path = os.path.join(local_dir, file_name) + os.makedirs(os.path.dirname(file_path), exist_ok=True) for idx in range(int(file_size / PART_SIZE)): start = idx * PART_SIZE end = (idx + 1) * PART_SIZE - 1 - tasks.append( - (progress, start, end, url, temp_file.name, cookies, headers)) + tasks.append((file_path, progress, start, end, url, file_name, + cookies, headers)) if end + 1 < file_size: - tasks.append((progress, end + 1, file_size - 1, url, - temp_file.name, cookies, headers)) + tasks.append((file_path, progress, end + 1, file_size - 1, url, + file_name, cookies, headers)) parallels = MODELSCOPE_DOWNLOAD_PARALLELS if MODELSCOPE_DOWNLOAD_PARALLELS <= 4 else 4 + # download every part with ThreadPoolExecutor( max_workers=parallels, thread_name_prefix='download') as executor: list(executor.map(download_part_with_retry, tasks)) - progress.close() + # merge parts. 
+ with open(os.path.join(local_dir, file_name), 'wb') as output_file: + for task in tasks: + part_file_name = task[0] + '_%s_%s' % (task[2], task[3]) + with open(part_file_name, 'rb') as part_file: + output_file.write(part_file.read()) + os.remove(part_file_name) - os.replace(temp_file.name, os.path.join(local_dir, file_name)) + +def http_get_model_file( + url: str, + local_dir: str, + file_name: str, + file_size: int, + cookies: CookieJar, + headers: Optional[Dict[str, str]] = None, +): + """Download remote file, will retry 5 times before giving up on errors. + + Args: + url(str): + actual download url of the file + local_dir(str): + local directory where the downloaded file stores + file_name(str): + name of the file stored in `local_dir` + file_size(int): + The file size. + cookies(CookieJar): + cookies used to authentication the user, which is used for downloading private repos + headers(Dict[str, str], optional): + http headers to carry necessary info when requesting the remote file + + Raises: + FileDownloadError: File download failed. + + """ + get_headers = {} if headers is None else copy.deepcopy(headers) + get_headers['X-Request-ID'] = str(uuid.uuid4().hex) + temp_file_path = os.path.join(local_dir, file_name) + os.makedirs(os.path.dirname(temp_file_path), exist_ok=True) + logger.debug('downloading %s to %s', url, temp_file_path) + # retry sleep 0.5s, 1s, 2s, 4s + retry = Retry( + total=API_FILE_DOWNLOAD_RETRY_TIMES, + backoff_factor=1, + allowed_methods=['GET']) + while True: + try: + with tqdm( + unit='B', + unit_scale=True, + unit_divisor=1024, + total=file_size if file_size > 0 else 1, + initial=0, + desc='Downloading [' + file_name + ']', + leave=True, + ) as progress: + if file_size == 0: + # Avoid empty file server request + with open(temp_file_path, 'w+'): + progress.update(1) + break + # Determine the length of any existing partial download + partial_length = 0 + # download partial, continue download + if os.path.exists(temp_file_path): + with open(temp_file_path, 'rb') as f: + partial_length = f.seek(0, io.SEEK_END) + progress.update(partial_length) + + # Check if download is complete + if partial_length >= file_size: + break + # closed range[], from 0. + get_headers['Range'] = 'bytes=%s-%s' % (partial_length, + file_size - 1) + with open(temp_file_path, 'ab+') as f: + r = requests.get( + url, + stream=True, + headers=get_headers, + cookies=cookies, + timeout=API_FILE_DOWNLOAD_TIMEOUT) + r.raise_for_status() + for chunk in r.iter_content( + chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): + if chunk: # filter out keep-alive new chunks + progress.update(len(chunk)) + f.write(chunk) + break + except (Exception) as e: # no matter what happen, we will retry. + retry = retry.increment('GET', url, error=e) + retry.sleep() + + logger.debug('storing %s in cache at %s', url, local_dir) def http_get_file( @@ -319,7 +564,7 @@ def http_get_file( unit_divisor=1024, total=total, initial=downloaded_size, - desc='Downloading', + desc='Downloading [' + file_name + ']', ) for chunk in r.iter_content( chunk_size=API_FILE_DOWNLOAD_CHUNK_SIZE): @@ -342,3 +587,31 @@ def http_get_file( logger.error(msg) raise FileDownloadError(msg) os.replace(temp_file.name, os.path.join(local_dir, file_name)) + + +def download_file(url, file_meta, temporary_cache_dir, cache, headers, + cookies): + if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < file_meta[ + 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: # parallel download large file. 
+ parallel_download( + url, + temporary_cache_dir, + file_meta['Path'], + headers=headers, + cookies=None if cookies is None else cookies.get_dict(), + file_size=file_meta['Size']) + else: + http_get_model_file( + url, + temporary_cache_dir, + file_meta['Path'], + file_size=file_meta['Size'], + headers=headers, + cookies=cookies) + + # check file integrity + temp_file = os.path.join(temporary_cache_dir, file_meta['Path']) + if FILE_HASH in file_meta: + file_integrity_validation(temp_file, file_meta[FILE_HASH]) + # put file into to cache + return cache.put_file(file_meta, temp_file) diff --git a/modelscope/hub/git.py b/modelscope/hub/git.py index b0fae148b..144d9d695 100644 --- a/modelscope/hub/git.py +++ b/modelscope/hub/git.py @@ -45,8 +45,9 @@ def _run_git_command(self, *args) -> subprocess.CompletedProcess: logger.debug(' '.join(args)) git_env = os.environ.copy() git_env['GIT_TERMINAL_PROMPT'] = '0' + command = [self.git_path, *args] response = subprocess.run( - [self.git_path, *args], + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=git_env, @@ -55,10 +56,18 @@ def _run_git_command(self, *args) -> subprocess.CompletedProcess: response.check_returncode() return response except subprocess.CalledProcessError as error: - logger.error('There are error run git command.') - raise GitError( - 'stdout: %s, stderr: %s' % - (response.stdout.decode('utf8'), error.stderr.decode('utf8'))) + std_out = response.stdout.decode('utf8') + std_err = error.stderr.decode('utf8') + if 'nothing to commit' in std_out: + logger.info( + 'Nothing to commit, your local repo is upto date with remote' + ) + return response + else: + logger.error( + 'Running git command: %s failed \n stdout: %s \n stderr: %s' + % (command, std_out, std_err)) + raise GitError(std_err) def config_auth_token(self, repo_dir, auth_token): url = self.get_repo_remote_url(repo_dir) diff --git a/modelscope/hub/snapshot_download.py b/modelscope/hub/snapshot_download.py index 82313b15c..015cadbd3 100644 --- a/modelscope/hub/snapshot_download.py +++ b/modelscope/hub/snapshot_download.py @@ -1,33 +1,46 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
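Illustrative usage only, not part of the diff: a minimal sketch of the download entry points reworked in the file_download.py hunks above and in the snapshot_download.py hunks that follow. Repo ids, file names and patterns are hypothetical placeholders.

from modelscope.hub.file_download import dataset_file_download, model_file_download
from modelscope.hub.snapshot_download import dataset_snapshot_download, snapshot_download

# A single model file, written under a caller-chosen directory instead of the shared cache.
cfg_path = model_file_download(
    'some_org/some_model', 'configuration.json', local_dir='./some_model')

# A single raw file from a dataset repository.
csv_path = dataset_file_download('some_org/some_dataset', 'train.csv')

# Whole repositories, filtered with the new fnmatch-style allow/ignore patterns.
model_dir = snapshot_download(
    'some_org/some_model', allow_patterns=['*.json', '*.safetensors'])
data_dir = dataset_snapshot_download(
    'some_org/some_dataset', ignore_patterns=['*.zip'])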
+import fnmatch import os import re -import tempfile +import uuid +from concurrent.futures import ThreadPoolExecutor from http.cookiejar import CookieJar from pathlib import Path from typing import Dict, List, Optional, Union +from tqdm.auto import tqdm + from modelscope.hub.api import HubApi, ModelScopeConfig -from modelscope.utils.constant import DEFAULT_MODEL_REVISION +from modelscope.hub.errors import InvalidParameter +from modelscope.hub.utils.caching import ModelFileSystemCache +from modelscope.hub.utils.utils import (get_model_masked_directory, + model_id_to_group_owner_name) +from modelscope.utils.constant import (DEFAULT_DATASET_REVISION, + DEFAULT_MODEL_REVISION, + REPO_TYPE_DATASET, REPO_TYPE_MODEL, + REPO_TYPE_SUPPORT) from modelscope.utils.logger import get_logger -from .constants import (FILE_HASH, MODELSCOPE_DOWNLOAD_PARALLELS, - MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB) -from .file_download import (get_file_download_url, http_get_file, - parallel_download) -from .utils.caching import ModelFileSystemCache -from .utils.utils import (file_integrity_validation, get_cache_dir, - model_id_to_group_owner_name) +from .file_download import (create_temporary_directory_and_cache, + download_file, get_file_download_url) logger = get_logger() -def snapshot_download(model_id: str, - revision: Optional[str] = DEFAULT_MODEL_REVISION, - cache_dir: Union[str, Path, None] = None, - user_agent: Optional[Union[Dict, str]] = None, - local_files_only: Optional[bool] = False, - cookies: Optional[CookieJar] = None, - ignore_file_pattern: List = None) -> str: +def snapshot_download( + model_id: str, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Union[str, Path, None] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + local_dir: Optional[str] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +) -> str: """Download all files of a repo. Downloads a whole snapshot of a repo's files at the specified revision. This is useful when you want all files from a repo, because you don't know which @@ -41,13 +54,101 @@ def snapshot_download(model_id: str, model_id (str): A user or an organization name and a repo name separated by a `/`. revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a commit hash. NOTE: currently only branch and tag name is supported - cache_dir (str, Path, optional): Path to the folder where cached files are stored. + cache_dir (str, Path, optional): Path to the folder where cached files are stored, model will + be save as cache_dir/model_id/THE_MODEL_FILES. + user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. + local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the + local cached file if it exists. + cookies (CookieJar, optional): The cookie of the request, default None. + ignore_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be ignored in downloading, like exact file names or file extensions. + allow_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be downloading, like exact file names or file extensions. 
+ local_dir (str, optional): Specific local directory path to which the file will be downloaded. + allow_patterns (`str` or `List`, *optional*, default to `None`): + If provided, only files matching at least one pattern are downloaded, priority over allow_file_pattern. + For hugging-face compatibility. + ignore_patterns (`str` or `List`, *optional*, default to `None`): + If provided, files matching any of the patterns are not downloaded, priority over ignore_file_pattern. + For hugging-face compatibility. + max_workers (`int`): The maximum number of workers to download files, default 8. + Raises: + ValueError: the value details. + + Returns: + str: Local folder path (string) of repo snapshot + + Note: + Raises the following errors: + - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError) + if `use_auth_token=True` and the token cannot be found. + - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if + ETag cannot be determined. + - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) + if some parameter value is invalid + """ + return _snapshot_download( + model_id, + repo_type=REPO_TYPE_MODEL, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + local_dir=local_dir, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + + +def dataset_snapshot_download( + dataset_id: str, + revision: Optional[str] = DEFAULT_DATASET_REVISION, + cache_dir: Union[str, Path, None] = None, + local_dir: Optional[str] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +) -> str: + """Download raw files of a dataset. + Downloads all files at the specified revision. This + is useful when you want all files from a dataset, because you don't know which + ones you will need a priori. All files are nested inside a folder in order + to keep their actual filename relative to that folder. + + An alternative would be to just clone a dataset but this would require that the + user always has git and git-lfs installed, and properly configured. + + Args: + dataset_id (str): A user or an organization name and a dataset name separated by a `/`. + revision (str, optional): An optional Git revision id which can be a branch name, a tag, or a + commit hash. NOTE: currently only branch and tag name is supported + cache_dir (str, Path, optional): Path to the folder where cached files are stored, dataset will + be save as cache_dir/dataset_id/THE_DATASET_FILES. + local_dir (str, optional): Specific local directory path to which the file will be downloaded. user_agent (str, dict, optional): The user-agent info in the form of a dictionary or a string. local_files_only (bool, optional): If `True`, avoid downloading the file and return the path to the local cached file if it exists. cookies (CookieJar, optional): The cookie of the request, default None. ignore_file_pattern (`str` or `List`, *optional*, default to `None`): Any file pattern to be ignored in downloading, like exact file names or file extensions. 
+ Use regression is deprecated. + allow_file_pattern (`str` or `List`, *optional*, default to `None`): + Any file pattern to be downloading, like exact file names or file extensions. + allow_patterns (`str` or `List`, *optional*, default to `None`): + If provided, only files matching at least one pattern are downloaded, priority over allow_file_pattern. + For hugging-face compatibility. + ignore_patterns (`str` or `List`, *optional*, default to `None`): + If provided, files matching any of the patterns are not downloaded, priority over ignore_file_pattern. + For hugging-face compatibility. + max_workers (`int`): The maximum number of workers to download files, default 8. Raises: ValueError: the value details. @@ -63,22 +164,54 @@ def snapshot_download(model_id: str, - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError) if some parameter value is invalid """ + return _snapshot_download( + dataset_id, + repo_type=REPO_TYPE_DATASET, + revision=revision, + cache_dir=cache_dir, + user_agent=user_agent, + local_files_only=local_files_only, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + local_dir=local_dir, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) - if cache_dir is None: - cache_dir = get_cache_dir() - if isinstance(cache_dir, Path): - cache_dir = str(cache_dir) - temporary_cache_dir = os.path.join(cache_dir, 'temp') - os.makedirs(temporary_cache_dir, exist_ok=True) - group_or_owner, name = model_id_to_group_owner_name(model_id) +def _snapshot_download( + repo_id: str, + *, + repo_type: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cache_dir: Union[str, Path, None] = None, + user_agent: Optional[Union[Dict, str]] = None, + local_files_only: Optional[bool] = False, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + local_dir: Optional[str] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8, +): + if not repo_type: + repo_type = REPO_TYPE_MODEL + if repo_type not in REPO_TYPE_SUPPORT: + raise InvalidParameter('Invalid repo type: %s, only support: %s' % + (repo_type, REPO_TYPE_SUPPORT)) - cache = ModelFileSystemCache(cache_dir, group_or_owner, name) + temporary_cache_dir, cache = create_temporary_directory_and_cache( + repo_id, local_dir=local_dir, cache_dir=cache_dir, repo_type=repo_type) + system_cache = cache_dir if cache_dir is not None else os.getenv( + 'MODELSCOPE_CACHE', + Path.home().joinpath('.cache', 'modelscope')) if local_files_only: if len(cache.cached_files) == 0: raise ValueError( 'Cannot find the requested files in the cached path and outgoing' - ' traffic has been disabled. To enable model look-ups and downloads' + ' traffic has been disabled. To enable look-ups and downloads' " online, set 'local_files_only' to False.") logger.warning('We can not confirm the cached file is for revision: %s' % revision) @@ -88,76 +221,281 @@ def snapshot_download(model_id: str, # make headers headers = { 'user-agent': - ModelScopeConfig.get_user_agent(user_agent=user_agent, ) + ModelScopeConfig.get_user_agent(user_agent=user_agent, ), } + if 'CI_TEST' not in os.environ: + # To count the download statistics, to add the snapshot-identifier as a header. 
+ headers['snapshot-identifier'] = str(uuid.uuid4()) _api = HubApi() if cookies is None: cookies = ModelScopeConfig.get_cookies() - revision = _api.get_valid_revision( - model_id, revision=revision, cookies=cookies) + repo_files = [] + if repo_type == REPO_TYPE_MODEL: + directory = os.path.abspath( + local_dir) if local_dir is not None else os.path.join( + system_cache, 'hub', repo_id) + print(f'Downloading Model to directory: {directory}') + revision_detail = _api.get_valid_revision_detail( + repo_id, revision=revision, cookies=cookies) + revision = revision_detail['Revision'] - snapshot_header = headers if 'CI_TEST' in os.environ else { - **headers, - **{ - 'Snapshot': 'True' + snapshot_header = headers if 'CI_TEST' in os.environ else { + **headers, + **{ + 'Snapshot': 'True' + } } - } - model_files = _api.get_model_files( - model_id=model_id, + if cache.cached_model_revision is not None: + snapshot_header[ + 'cached_model_revision'] = cache.cached_model_revision + + repo_files = _api.get_model_files( + model_id=repo_id, + revision=revision, + recursive=True, + use_cookies=False if cookies is None else cookies, + headers=snapshot_header, + ) + _download_file_lists( + repo_files, + cache, + temporary_cache_dir, + repo_id, + _api, + None, + None, + headers, + repo_type=repo_type, + revision=revision, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + if '.' in repo_id: + masked_directory = get_model_masked_directory( + directory, repo_id) + if os.path.exists(directory): + logger.info( + 'Target directory already exists, skipping creation.') + else: + logger.info(f'Creating symbolic link [{directory}].') + try: + os.symlink( + os.path.abspath(masked_directory), directory) + except OSError: + logger.warning( + f'Failed to create symbolic link {directory}.') + + elif repo_type == REPO_TYPE_DATASET: + directory = os.path.abspath( + local_dir) if local_dir else os.path.join( + system_cache, 'datasets', repo_id) + print(f'Downloading Dataset to directory: {directory}') + + group_or_owner, name = model_id_to_group_owner_name(repo_id) + revision_detail = revision or DEFAULT_DATASET_REVISION + + logger.info('Fetching dataset repo file list...') + repo_files = fetch_repo_files(_api, name, group_or_owner, + revision_detail) + + if repo_files is None: + logger.error( + f'Failed to retrieve file list for dataset: {repo_id}') + return None + + _download_file_lists( + repo_files, + cache, + temporary_cache_dir, + repo_id, + _api, + name, + group_or_owner, + headers, + repo_type=repo_type, + revision=revision, + cookies=cookies, + ignore_file_pattern=ignore_file_pattern, + allow_file_pattern=allow_file_pattern, + ignore_patterns=ignore_patterns, + allow_patterns=allow_patterns, + max_workers=max_workers) + + cache.save_model_version(revision_info=revision_detail) + cache_root_path = cache.get_root_location() + return cache_root_path + + +def fetch_repo_files(_api, name, group_or_owner, revision): + page_number = 1 + page_size = 150 + repo_files = [] + + while True: + files_list_tree = _api.list_repo_tree( + dataset_name=name, + namespace=group_or_owner, revision=revision, + root_path='/', recursive=True, - use_cookies=False if cookies is None else cookies, - headers=snapshot_header, - ) - - if ignore_file_pattern is None: - ignore_file_pattern = [] - if isinstance(ignore_file_pattern, str): - ignore_file_pattern = [ignore_file_pattern] - - with 
tempfile.TemporaryDirectory( - dir=temporary_cache_dir) as temp_cache_dir: - for model_file in model_files: - if model_file['Type'] == 'tree' or \ - any([re.search(pattern, model_file['Name']) is not None for pattern in ignore_file_pattern]): + page_number=page_number, + page_size=page_size) + + if not ('Code' in files_list_tree and files_list_tree['Code'] == 200): + logger.error(f'Get dataset file list failed, request_id: \ + {files_list_tree["RequestId"]}, message: {files_list_tree["Message"]}' + ) + return None + + cur_repo_files = files_list_tree['Data']['Files'] + repo_files.extend(cur_repo_files) + + if len(cur_repo_files) < page_size: + break + + page_number += 1 + + return repo_files + + +def _is_valid_regex(pattern: str): + try: + re.compile(pattern) + return True + except BaseException: + return False + + +def _normalize_patterns(patterns: Union[str, List[str]]): + if isinstance(patterns, str): + patterns = [patterns] + if patterns is not None: + patterns = [ + item if not item.endswith('/') else item + '*' for item in patterns + ] + return patterns + + +def _get_valid_regex_pattern(patterns: List[str]): + if patterns is not None: + regex_patterns = [] + for item in patterns: + if _is_valid_regex(item): + regex_patterns.append(item) + return regex_patterns + else: + return None + + +def thread_download(func, iterable, max_workers, **kwargs): + # Create a tqdm progress bar with the total number of files to fetch + with tqdm( + total=len(iterable), + desc=f'Fetching {len(iterable)} files') as pbar: + # Define a wrapper function to update the progress bar + def progress_wrapper(*args, **kwargs): + result = func(*args, **kwargs) + pbar.update(1) + return result + + with ThreadPoolExecutor(max_workers=max_workers) as executor: + executor.map(progress_wrapper, iterable) + + +def _download_file_lists( + repo_files: List[str], + cache: ModelFileSystemCache, + temporary_cache_dir: str, + repo_id: str, + api: HubApi, + name: str, + group_or_owner: str, + headers, + repo_type: Optional[str] = None, + revision: Optional[str] = DEFAULT_MODEL_REVISION, + cookies: Optional[CookieJar] = None, + ignore_file_pattern: Optional[Union[str, List[str]]] = None, + allow_file_pattern: Optional[Union[str, List[str]]] = None, + allow_patterns: Optional[Union[List[str], str]] = None, + ignore_patterns: Optional[Union[List[str], str]] = None, + max_workers: int = 8): + ignore_patterns = _normalize_patterns(ignore_patterns) + allow_patterns = _normalize_patterns(allow_patterns) + ignore_file_pattern = _normalize_patterns(ignore_file_pattern) + allow_file_pattern = _normalize_patterns(allow_file_pattern) + # to compatible regex usage. 
+ ignore_regex_pattern = _get_valid_regex_pattern(ignore_file_pattern) + + filtered_repo_files = [] + for repo_file in repo_files: + if repo_file['Type'] == 'tree': + continue + try: + # processing patterns + if ignore_patterns and any([ + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in ignore_patterns + ]): + continue + + if ignore_file_pattern and any([ + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in ignore_file_pattern + ]): + continue + + if ignore_regex_pattern and any([ + re.search(pattern, repo_file['Name']) is not None + for pattern in ignore_regex_pattern + ]): # noqa E501 + continue + + if allow_patterns is not None and allow_patterns: + if not any( + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in allow_patterns): continue - # check model_file is exist in cache, if existed, skip download, otherwise download - if cache.exists(model_file): - file_name = os.path.basename(model_file['Name']) - logger.debug( - f'File {file_name} already in cache, skip downloading!' - ) + + if allow_file_pattern is not None and allow_file_pattern: + if not any( + fnmatch.fnmatch(repo_file['Path'], pattern) + for pattern in allow_file_pattern): continue + # check model_file is exist in cache, if existed, skip download + if cache.exists(repo_file): + file_name = os.path.basename(repo_file['Name']) + logger.debug( + f'File {file_name} already in cache with identical hash, skip downloading!' + ) + continue + except Exception as e: + logger.warning('The file pattern is invalid : %s' % e) + else: + filtered_repo_files.append(repo_file) - # get download url - url = get_file_download_url( - model_id=model_id, - file_path=model_file['Path'], - revision=revision) - - if MODELSCOPE_PARALLEL_DOWNLOAD_THRESHOLD_MB * 1000 * 1000 < model_file[ - 'Size'] and MODELSCOPE_DOWNLOAD_PARALLELS > 1: - parallel_download( - url, - temp_cache_dir, - model_file['Name'], - headers=headers, - cookies=None - if cookies is None else cookies.get_dict(), - file_size=model_file['Size']) - else: - http_get_file( - url, - temp_cache_dir, - model_file['Name'], - headers=headers, - cookies=cookies) - - # check file integrity - temp_file = os.path.join(temp_cache_dir, model_file['Name']) - if FILE_HASH in model_file: - file_integrity_validation(temp_file, model_file[FILE_HASH]) - # put file to cache - cache.put_file(model_file, temp_file) - - return os.path.join(cache.get_root_location()) + def _download_single_file(repo_file): + if repo_type == REPO_TYPE_MODEL: + url = get_file_download_url( + model_id=repo_id, + file_path=repo_file['Path'], + revision=revision) + elif repo_type == REPO_TYPE_DATASET: + url = api.get_dataset_file_url( + file_name=repo_file['Path'], + dataset_name=name, + namespace=group_or_owner, + revision=revision) + else: + raise InvalidParameter( + f'Invalid repo type: {repo_type}, supported types: {REPO_TYPE_SUPPORT}' + ) + download_file(url, repo_file, temporary_cache_dir, cache, headers, + cookies) + + if len(filtered_repo_files) > 0: + thread_download(_download_single_file, filtered_repo_files, + max_workers) + logger.info(f"Download {repo_type} '{repo_id}' successfully.") diff --git a/modelscope/hub/utils/caching.py b/modelscope/hub/utils/caching.py index f92aaaf46..e1dcf83bc 100644 --- a/modelscope/hub/utils/caching.py +++ b/modelscope/hub/utils/caching.py @@ -5,17 +5,26 @@ import pickle import tempfile from shutil import move, rmtree +from typing import Dict -from modelscope.hub.constants import MODEL_META_FILE_NAME, MODEL_META_MODEL_ID +from modelscope.hub.constants 
import ( # noqa + FILE_HASH, MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION) +from modelscope.hub.utils.utils import compute_hash from modelscope.utils.logger import get_logger logger = get_logger() + +enable_default_hash_validation = \ + os.getenv(MODELSCOPE_ENABLE_DEFAULT_HASH_VALIDATION, 'False').strip().lower() == 'true' """Implements caching functionality, used internally only """ class FileSystemCache(object): KEY_FILE_NAME = '.msc' + MODEL_META_FILE_NAME = '.mdl' + MODEL_META_MODEL_ID = 'id' + MODEL_VERSION_FILE_NAME = '.mv' """Local file cache. """ @@ -133,24 +142,47 @@ def __init__(self, cache_root, owner=None, name=None): self.load_model_meta() else: super().__init__(os.path.join(cache_root, owner, name)) - self.model_meta = {MODEL_META_MODEL_ID: '%s/%s' % (owner, name)} + self.model_meta = { + FileSystemCache.MODEL_META_MODEL_ID: '%s/%s' % (owner, name) + } self.save_model_meta() + self.cached_model_revision = self.load_model_version() def load_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) if os.path.exists(meta_file_path): with open(meta_file_path, 'rb') as f: self.model_meta = pickle.load(f) else: - self.model_meta = {MODEL_META_MODEL_ID: 'unknown'} + self.model_meta = {FileSystemCache.MODEL_META_MODEL_ID: 'unknown'} + + def load_model_version(self): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + if os.path.exists(model_version_file_path): + with open(model_version_file_path, 'r') as f: + return f.read().strip() + else: + return None + + def save_model_version(self, revision_info: Dict): + model_version_file_path = os.path.join( + self.cache_root_location, FileSystemCache.MODEL_VERSION_FILE_NAME) + with open(model_version_file_path, 'w') as f: + if isinstance(revision_info, dict): + version_info_str = 'Revision:%s,CreatedAt:%s' % ( + revision_info['Revision'], revision_info['CreatedAt']) + f.write(version_info_str) + else: + f.write(revision_info) def get_model_id(self): - return self.model_meta[MODEL_META_MODEL_ID] + return self.model_meta[FileSystemCache.MODEL_META_MODEL_ID] def save_model_meta(self): meta_file_path = os.path.join(self.cache_root_location, - MODEL_META_FILE_NAME) + FileSystemCache.MODEL_META_FILE_NAME) with open(meta_file_path, 'wb') as f: pickle.dump(self.model_meta, f) @@ -226,26 +258,40 @@ def __get_cache_key(self, model_file_info): return cache_key def exists(self, model_file_info): - """Check the file is cached or not. + """Check the file is cached or not. 
Note existence check will also cover digest check Args: model_file_info (CachedFileInfo): The cached file info Returns: - bool: If exists return True otherwise False + bool: If exists and has the same hash, return True otherwise False """ key = self.__get_cache_key(model_file_info) is_exists = False + file_path = key['Path'] + cache_file_path = os.path.join(self.cache_root_location, + model_file_info['Path']) for cached_key in self.cached_files: - if cached_key['Path'] == key['Path'] and ( + if cached_key['Path'] == file_path and ( cached_key['Revision'].startswith(key['Revision']) or key['Revision'].startswith(cached_key['Revision'])): - is_exists = True - break - file_path = os.path.join(self.cache_root_location, - model_file_info['Path']) + expected_hash = model_file_info[FILE_HASH] + if expected_hash is not None and os.path.exists( + cache_file_path): + # compute hash only when enabled, otherwise just meet expectation by default + if enable_default_hash_validation: + cache_file_sha256 = compute_hash(cache_file_path) + else: + cache_file_sha256 = expected_hash + if expected_hash == cache_file_sha256: + is_exists = True + break + else: + logger.info( + f'File [{file_path}] exists in cache but with a mismatched hash, will re-download.' + ) if is_exists: - if os.path.exists(file_path): + if os.path.exists(cache_file_path): return True else: self.remove_key( diff --git a/modelscope/hub/utils/utils.py b/modelscope/hub/utils/utils.py index 31e6e72c0..bb38f26ac 100644 --- a/modelscope/hub/utils/utils.py +++ b/modelscope/hub/utils/utils.py @@ -3,6 +3,7 @@ import hashlib import os from datetime import datetime +from pathlib import Path from typing import Optional import requests @@ -12,7 +13,7 @@ MODEL_ID_SEPARATOR, MODELSCOPE_SDK_DEBUG, MODELSCOPE_URL_SCHEME) from modelscope.hub.errors import FileIntegrityError -from modelscope.utils.file_utils import get_default_cache_dir +from modelscope.utils.file_utils import get_default_modelscope_cache_dir from modelscope.utils.logger import get_logger logger = get_logger() @@ -28,17 +29,48 @@ def model_id_to_group_owner_name(model_id): return group_or_owner, name +# during model download, the '.' would be converted to '___' to produce +# actual physical (masked) directory for storage +def get_model_masked_directory(directory, model_id): + parts = directory.rsplit('/', 2) + # this is the actual directory the model files are located. + masked_directory = os.path.join(parts[0], model_id.replace('.', '___')) + return masked_directory + + +def convert_readable_size(size_bytes): + import math + if size_bytes == 0: + return '0B' + size_name = ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB', 'YB') + i = int(math.floor(math.log(size_bytes, 1024))) + p = math.pow(1024, i) + s = round(size_bytes / p, 2) + return f'{s} {size_name[i]}' + + +def get_folder_size(folder_path): + total_size = 0 + for path in Path(folder_path).rglob('*'): + if path.is_file(): + total_size += path.stat().st_size + return total_size + + +# return a readable string that describe size of for a given folder (MB, GB etc.) +def get_readable_folder_size(folder_path) -> str: + return convert_readable_size(get_folder_size(folder_path=folder_path)) + + def get_cache_dir(model_id: Optional[str] = None): """cache dir precedence: function parameter > environment > ~/.cache/modelscope/hub - Args: model_id (str, optional): The model id. - Returns: str: the model_id dir if model_id not None, otherwise cache root dir. 
""" - default_cache_dir = get_default_cache_dir() + default_cache_dir = Path.home().joinpath('.cache', 'modelscope') base_path = os.getenv('MODELSCOPE_CACHE', os.path.join(default_cache_dir, 'hub')) return base_path if model_id is None else os.path.join( @@ -89,6 +121,7 @@ def file_integrity_validation(file_path, expected_sha256): file_sha256 = compute_hash(file_path) if not file_sha256 == expected_sha256: os.remove(file_path) - msg = 'File %s integrity check failed, the download may be incomplete, please try again.' % file_path + msg = 'File %s integrity check failed, expected sha256 signature is %s, actual is %s, the download may be incomplete, please try again.' % ( # noqa E501 + file_path, expected_sha256, file_sha256) logger.error(msg) raise FileIntegrityError(msg) diff --git a/modelscope/metainfo.py b/modelscope/metainfo.py index 87d5f3129..8166e004c 100644 --- a/modelscope/metainfo.py +++ b/modelscope/metainfo.py @@ -39,6 +39,7 @@ class Models(object): body_3d_keypoints_hdformer = 'hdformer' crowd_counting = 'HRNetCrowdCounting' face_2d_keypoints = 'face-2d-keypoints' + star_68ldk_detection = 'star-68ldk-detection' panoptic_segmentation = 'swinL-panoptic-segmentation' r50_panoptic_segmentation = 'r50-panoptic-segmentation' image_reid_person = 'passvitb' @@ -52,10 +53,13 @@ class Models(object): vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation' text_driven_segmentation = 'text-driven-segmentation' newcrfs_depth_estimation = 'newcrfs-depth-estimation' + omnidata_normal_estimation = 'omnidata-normal-estimation' panovit_layout_estimation = 'panovit-layout-estimation' unifuse_depth_estimation = 'unifuse-depth-estimation' s2net_depth_estimation = 's2net-depth-estimation' dro_resnet18_depth_estimation = 'dro-resnet18-depth-estimation' + raft_dense_optical_flow_estimation = 'raft-dense-optical-flow-estimation' + human_normal_estimation = 'human-normal-estimation' resnet50_bert = 'resnet50-bert' referring_video_object_segmentation = 'swinT-referring-video-object-segmentation' fer = 'fer' @@ -87,12 +91,15 @@ class Models(object): video_object_segmentation = 'video-object-segmentation' video_deinterlace = 'video-deinterlace' quadtree_attention_image_matching = 'quadtree-attention-image-matching' + loftr_image_local_feature_matching = 'loftr-image-local-feature-matching' + lightglue_image_matching = 'lightglue-image-matching' vision_middleware = 'vision-middleware' vidt = 'vidt' video_stabilization = 'video-stabilization' real_basicvsr = 'real-basicvsr' rcp_sceneflow_estimation = 'rcp-sceneflow-estimation' image_casmvs_depth_estimation = 'image-casmvs-depth-estimation' + image_geomvsnet_depth_estimation = 'image-geomvsnet-depth-estimation' vop_retrieval_model = 'vop-retrieval-model' vop_retrieval_model_se = 'vop-retrieval-model-se' ddcolor = 'ddcolor' @@ -127,6 +134,9 @@ class Models(object): human_image_generation = 'human-image-generation' image_view_transform = 'image-view-transform' image_control_3d_portrait = 'image-control-3d-portrait' + rife = 'rife' + anydoor = 'anydoor' + self_supervised_depth_completion = 'self-supervised-depth-completion' # nlp models bert = 'bert' @@ -183,6 +193,7 @@ class Models(object): # audio models sambert_hifigan = 'sambert-hifigan' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base' speech_dfsmn_ans = 'speech_dfsmn_ans' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_dfsmn_kws_char_farfield_iot = 
'speech_dfsmn_kws_char_farfield_iot' @@ -195,9 +206,13 @@ class Models(object): generic_itn = 'generic-itn' generic_punc = 'generic-punc' generic_sv = 'generic-sv' + tdnn_sv = 'tdnn-sv' ecapa_tdnn_sv = 'ecapa-tdnn-sv' campplus_sv = 'cam++-sv' eres2net_sv = 'eres2net-sv' + eres2netv2_sv = 'eres2netv2-sv' + resnet_sv = 'resnet-sv' + res2net_sv = 'res2net-sv' eres2net_aug_sv = 'eres2net-aug-sv' scl_sd = 'scl-sd' scl_sd_xvector = 'scl-sd-xvector' @@ -205,7 +220,11 @@ class Models(object): eres2net_lre = 'eres2net-lre' cluster_backend = 'cluster-backend' rdino_tdnn_sv = 'rdino_ecapa-tdnn-sv' + sdpn_sv = 'sdpn_ecapa-sv' generic_lm = 'generic-lm' + audio_quantization = 'audio-quantization' + laura_codec = 'laura-codec' + funasr = 'funasr' # multi-modal models ofa = 'ofa' @@ -326,6 +345,7 @@ class Pipelines(object): tinymog_face_detection = 'manual-face-detection-tinymog' facial_expression_recognition = 'vgg19-facial-expression-recognition-fer' facial_landmark_confidence = 'manual-facial-landmark-confidence-flcm' + facial_68ldk_detection = 'facial-68ldk-detection' face_attribute_recognition = 'resnet34-face-attribute-recognition-fairface' retina_face_detection = 'resnet50-face-detection-retinaface' mog_face_detection = 'resnet101-face-detection-cvpr22papermogface' @@ -386,10 +406,13 @@ class Pipelines(object): language_guided_video_summarization = 'clip-it-video-summarization' image_semantic_segmentation = 'image-semantic-segmentation' image_depth_estimation = 'image-depth-estimation' + image_normal_estimation = 'image-normal-estimation' indoor_layout_estimation = 'indoor-layout-estimation' + image_local_feature_matching = 'image-local-feature-matching' video_depth_estimation = 'video-depth-estimation' panorama_depth_estimation = 'panorama-depth-estimation' panorama_depth_estimation_s2net = 'panorama-depth-estimation-s2net' + dense_optical_flow_estimation = 'dense-optical-flow-estimation' image_reid_person = 'passvitb-image-reid-person' image_inpainting = 'fft-inpainting' image_paintbyexample = 'stablediffusion-paintbyexample' @@ -416,6 +439,7 @@ class Pipelines(object): video_object_segmentation = 'video-object-segmentation' video_deinterlace = 'video-deinterlace' image_matching = 'image-matching' + image_matching_fast = 'image-matching-fast' video_stabilization = 'video-stabilization' video_super_resolution = 'realbasicvsr-video-super-resolution' pointcloud_sceneflow_estimation = 'pointcloud-sceneflow-estimation' @@ -447,6 +471,7 @@ class Pipelines(object): image_quality_assessment_degradation = 'image-quality-assessment-degradation' vision_efficient_tuning = 'vision-efficient-tuning' image_bts_depth_estimation = 'image-bts-depth-estimation' + image_depth_estimation_marigold = 'image-depth-estimation-marigold' pedestrian_attribute_recognition = 'resnet50_pedestrian-attribute-recognition_image' text_to_360panorama_image = 'text-to-360panorama-image' image_try_on = 'image-try-on' @@ -455,6 +480,11 @@ class Pipelines(object): human3d_animation = 'human3d-animation' image_view_transform = 'image-view-transform' image_control_3d_portrait = 'image-control-3d-portrait' + rife_video_frame_interpolation = 'rife-video-frame-interpolation' + anydoor = 'anydoor' + image_to_3d = 'image-to-3d' + self_supervised_depth_completion = 'self-supervised-depth-completion' + human_normal_estimation = 'human-normal-estimation' # nlp tasks automatic_post_editing = 'automatic-post-editing' @@ -522,21 +552,24 @@ class Pipelines(object): sambert_hifigan_tts = 'sambert-hifigan-tts' speech_dfsmn_aec_psm_16k = 
'speech-dfsmn-aec-psm-16k' speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k' + speech_zipenhancer_ans_multiloss_16k_base = 'speech_zipenhancer_ans_multiloss_16k_base' speech_dfsmn_ans_psm_48k_causal = 'speech_dfsmn_ans_psm_48k_causal' speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield' speech_separation = 'speech-separation' kws_kwsbp = 'kws-kwsbp' - asr_inference = 'asr-inference' asr_wenet_inference = 'asr-wenet-inference' itn_inference = 'itn-inference' - punc_inference = 'punc-inference' - sv_inference = 'sv-inference' speaker_diarization_inference = 'speaker-diarization-inference' vad_inference = 'vad-inference' funasr_speech_separation = 'funasr-speech-separation' speaker_verification = 'speaker-verification' + speaker_verification_tdnn = 'speaker-verification-tdnn' speaker_verification_rdino = 'speaker-verification-rdino' + speaker_verification_sdpn = 'speaker-verification-sdpn' speaker_verification_eres2net = 'speaker-verification-eres2net' + speaker_verification_eres2netv2 = 'speaker-verification-eres2netv2' + speaker_verification_resnet = 'speaker-verification-resnet' + speaker_verification_res2net = 'speaker-verification-res2net' speech_language_recognition = 'speech-language-recognition' speech_language_recognition_eres2net = 'speech-language-recognition-eres2net' speaker_change_locating = 'speaker-change-locating' @@ -545,6 +578,9 @@ class Pipelines(object): segmentation_clustering = 'segmentation-clustering' lm_inference = 'language-score-prediction' speech_timestamp_inference = 'speech-timestamp-inference' + audio_quantization = 'audio-quantization' + audio_quantization_inference = 'audio-quantization-inference' + laura_codec_tts_inference = 'laura-codec-tts-inference' # multi-modal tasks image_captioning = 'image-captioning' @@ -584,6 +620,9 @@ class Pipelines(object): # science tasks protein_structure = 'unifold-protein-structure' + # funasr task + funasr_pipeline = 'funasr-pipeline' + DEFAULT_MODEL_FOR_PIPELINE = { # TaskName: (pipeline_module_name, model_repo) @@ -778,6 +817,12 @@ class Pipelines(object): Tasks.image_depth_estimation: (Pipelines.image_depth_estimation, 'damo/cv_newcrfs_image-depth-estimation_indoor'), + Tasks.image_normal_estimation: + (Pipelines.image_normal_estimation, + 'Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal'), + Tasks.human_normal_estimation: + (Pipelines.human_normal_estimation, + 'Damo_XR_Lab/cv_human_monocular-normal-estimation'), Tasks.indoor_layout_estimation: (Pipelines.indoor_layout_estimation, 'damo/cv_panovit_indoor-layout-estimation'), @@ -787,6 +832,12 @@ class Pipelines(object): Tasks.panorama_depth_estimation: (Pipelines.panorama_depth_estimation, 'damo/cv_unifuse_panorama-depth-estimation'), + Tasks.dense_optical_flow_estimation: + (Pipelines.dense_optical_flow_estimation, + 'Damo_XR_Lab/cv_raft_dense-optical-flow_things'), + Tasks.image_local_feature_matching: + (Pipelines.image_local_feature_matching, + 'Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data'), Tasks.image_style_transfer: (Pipelines.image_style_transfer, 'damo/cv_aams_style-transfer_damo'), Tasks.face_image_generation: (Pipelines.face_image_generation, @@ -804,20 +855,20 @@ class Pipelines(object): Tasks.image_to_image_generation: (Pipelines.image_to_image_generation, 'damo/cv_latent_diffusion_image2image_generate'), - Tasks.image_classification: - (Pipelines.daily_image_classification, - 'damo/cv_vit-base_image-classification_Dailylife-labels'), - Tasks.image_object_detection: - 
(Pipelines.image_object_detection_auto, - 'damo/cv_yolox_image-object-detection-auto'), - Tasks.ocr_recognition: - (Pipelines.ocr_recognition, - 'damo/cv_convnextTiny_ocr-recognition-general_damo'), + Tasks.image_classification: ( + Pipelines.daily_image_classification, + 'damo/cv_vit-base_image-classification_Dailylife-labels'), + Tasks.image_object_detection: ( + Pipelines.image_object_detection_auto, + 'damo/cv_yolox_image-object-detection-auto'), + Tasks.ocr_recognition: ( + Pipelines.ocr_recognition, + 'damo/cv_convnextTiny_ocr-recognition-general_damo'), Tasks.skin_retouching: (Pipelines.skin_retouching, 'damo/cv_unet_skin-retouching'), - Tasks.faq_question_answering: - (Pipelines.faq_question_answering, - 'damo/nlp_structbert_faq-question-answering_chinese-base'), + Tasks.faq_question_answering: ( + Pipelines.faq_question_answering, + 'damo/nlp_structbert_faq-question-answering_chinese-base'), Tasks.crowd_counting: (Pipelines.crowd_counting, 'damo/cv_hrnet_crowd-counting_dcanet'), Tasks.video_single_object_tracking: ( @@ -936,7 +987,10 @@ class Pipelines(object): 'damo/cv_image-view-transform'), Tasks.image_control_3d_portrait: ( Pipelines.image_control_3d_portrait, - 'damo/cv_vit_image-control-3d-portrait-synthesis') + 'damo/cv_vit_image-control-3d-portrait-synthesis'), + Tasks.self_supervised_depth_completion: ( + Pipelines.self_supervised_depth_completion, + 'damo/self-supervised-depth-completion') } @@ -959,6 +1013,7 @@ class CVTrainers(object): nerf_recon_4k = 'nerf-recon-4k' action_detection = 'action-detection' vision_efficient_tuning = 'vision-efficient-tuning' + self_supervised_depth_completion = 'self-supervised-depth-completion' class NLPTrainers(object): diff --git a/modelscope/metrics/__init__.py b/modelscope/metrics/__init__.py index 75ccfcf96..e95a22fe1 100644 --- a/modelscope/metrics/__init__.py +++ b/modelscope/metrics/__init__.py @@ -69,7 +69,9 @@ 'loss_metric': ['LossMetric'], 'image_colorization_metric': ['ImageColorizationMetric'], 'ocr_recognition_metric': ['OCRRecognitionMetric'], - 'translation_evaluation_metric': ['TranslationEvaluationMetric'] + 'translation_evaluation_metric': ['TranslationEvaluationMetric'], + 'video_super_resolution_metric.video_super_resolution_metric': + ['VideoSuperResolutionMetric'], } import sys diff --git a/modelscope/models/__init__.py b/modelscope/models/__init__.py index 24d86dfd5..e7e08ede9 100644 --- a/modelscope/models/__init__.py +++ b/modelscope/models/__init__.py @@ -1,9 +1,11 @@ # Copyright (c) Alibaba, Inc. and its affiliates. from typing import TYPE_CHECKING +from modelscope.utils.automodel_utils import fix_transformers_upgrade from modelscope.utils.error import (AUDIO_IMPORT_ERROR, TENSORFLOW_IMPORT_WARNING) -from modelscope.utils.import_utils import is_torch_available +from modelscope.utils.import_utils import (is_torch_available, + is_transformers_available) from . import audio, cv, multi_modal, nlp from .base import Head, Model from .builder import BACKBONES, HEADS, MODELS, build_model @@ -11,3 +13,6 @@ if is_torch_available(): from .base.base_torch_model import TorchModel from .base.base_torch_head import TorchHead + +if is_transformers_available(): + fix_transformers_upgrade() diff --git a/modelscope/models/audio/ans/zipenhancer.py b/modelscope/models/audio/ans/zipenhancer.py new file mode 100644 index 000000000..544d9dc74 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer.py @@ -0,0 +1,210 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. and its affiliates. 
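# --- Editor's note (illustrative sketch, not part of the patch) --------------
# The metainfo additions above register new keys such as
# Models.speech_zipenhancer_ans_multiloss_16k_base and the matching
# Pipelines entry. A minimal sketch of how such a registration is usually
# exercised through the ModelScope pipeline factory is shown below; the model
# id string and the output file names are placeholders, and the repo id
# actually published on ModelScope may differ.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Resolve the acoustic-noise-suppression task to the newly registered model.
ans = pipeline(
    Tasks.acoustic_noise_suppression,
    model='damo/speech_zipenhancer_ans_multiloss_16k_base')  # hypothetical id

# Denoise a local 16 kHz wav; output_path mirrors how the existing ANS
# pipelines are commonly invoked and is where the enhanced wav is written.
result = ans('noisy_speech_16k.wav', output_path='enhanced_speech_16k.wav')
# ------------------------------------------------------------------------------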
+ +import os +import random +from typing import Dict + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.metainfo import Models +from modelscope.models import TorchModel +from modelscope.models.base import Tensor +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from .zipenhancer_layers.generator import (DenseEncoder, MappingDecoder, + PhaseDecoder) +from .zipenhancer_layers.scaling import ScheduledFloat +from .zipenhancer_layers.zipenhancer_layer import Zipformer2DualPathEncoder + + +@MODELS.register_module( + Tasks.acoustic_noise_suppression, + module_name=Models.speech_zipenhancer_ans_multiloss_16k_base) +class ZipenhancerDecorator(TorchModel): + + def __init__(self, model_dir: str, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + h = dict( + num_tsconformers=kwargs['num_tsconformers'], + dense_channel=kwargs['dense_channel'], + former_conf=kwargs['former_conf'], + batch_first=kwargs['batch_first'], + model_num_spks=kwargs['model_num_spks'], + ) + # num_tsconformers, dense_channel, former_name, former_conf, batch_first, model_num_spks + + h = AttrDict(h) + self.model = ZipEnhancer(h) + model_bin_file = os.path.join(model_dir, + ModelFile.TORCH_MODEL_BIN_FILE) + if os.path.exists(model_bin_file): + checkpoint = torch.load( + model_bin_file, map_location=torch.device('cpu')) + if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: + # the new trained model by user is based on ZipenhancerDecorator + self.load_state_dict(checkpoint['state_dict']) + else: + # The released model on Modelscope is based on Zipenhancer + # self.model.load_state_dict(checkpoint, strict=False) + self.model.load_state_dict(checkpoint['generator']) + # print(checkpoint['generator'].keys()) + + def forward(self, inputs: Dict[str, Tensor]) -> Dict[str, Tensor]: + n_fft = 400 + hop_size = 100 + win_size = 400 + noisy_wav = inputs['noisy'] + norm_factor = torch.sqrt(noisy_wav.shape[1] + / torch.sum(noisy_wav**2.0)) + noisy_audio = (noisy_wav * norm_factor) + + mag, pha, com = mag_pha_stft( + noisy_audio, + n_fft, + hop_size, + win_size, + compress_factor=0.3, + center=True) + amp_g, pha_g, com_g, _, others = self.model.forward(mag, pha) + wav = mag_pha_istft( + amp_g, + pha_g, + n_fft, + hop_size, + win_size, + compress_factor=0.3, + center=True) + + wav = wav / norm_factor + + output = { + 'wav_l2': wav, + } + + return output + + +class ZipEnhancer(nn.Module): + + def __init__(self, h): + """ + Initialize the ZipEnhancer module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + having num_tsconformers, former_name, former_conf, mask_decoder_type, ... + """ + super(ZipEnhancer, self).__init__() + self.h = h + + num_tsconformers = h.num_tsconformers + self.num_tscblocks = num_tsconformers + self.dense_encoder = DenseEncoder(h, in_channel=2) + + self.TSConformer = Zipformer2DualPathEncoder( + output_downsampling_factor=1, + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + **h.former_conf) + + self.mask_decoder = MappingDecoder(h, out_channel=h.model_num_spks) + self.phase_decoder = PhaseDecoder(h, out_channel=h.model_num_spks) + + def forward(self, noisy_mag, noisy_pha): # [B, F, T] + """ + Forward pass of the ZipEnhancer module. + + Args: + noisy_mag (Tensor): Noisy magnitude input tensor of shape [B, F, T]. + noisy_pha (Tensor): Noisy phase input tensor of shape [B, F, T]. 
+ + Returns: + Tuple: denoised magnitude, denoised phase, denoised complex representation, + (optional) predicted noise components, and other auxiliary information. + """ + others = dict() + + noisy_mag = noisy_mag.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F] + noisy_pha = noisy_pha.unsqueeze(-1).permute(0, 3, 2, 1) # [B, 1, T, F] + x = torch.cat((noisy_mag, noisy_pha), dim=1) # [B, 2, T, F] + x = self.dense_encoder(x) + + # [B, C, T, F] + x = self.TSConformer(x) + + pred_mag = self.mask_decoder(x) + pred_pha = self.phase_decoder(x) + # b, c, t, f -> b, 1, t, f -> b, f, t, 1 -> b, f, t + denoised_mag = pred_mag[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, + 1).squeeze(-1) + + # b, t, f + denoised_pha = pred_pha[:, 0, :, :].unsqueeze(1).permute(0, 3, 2, + 1).squeeze(-1) + # b, t, f + denoised_com = torch.stack((denoised_mag * torch.cos(denoised_pha), + denoised_mag * torch.sin(denoised_pha)), + dim=-1) + + return denoised_mag, denoised_pha, denoised_com, None, others + + +class AttrDict(dict): + + def __init__(self, *args, **kwargs): + super(AttrDict, self).__init__(*args, **kwargs) + self.__dict__ = self + + +def mag_pha_stft(y, + n_fft, + hop_size, + win_size, + compress_factor=1.0, + center=True): + hann_window = torch.hann_window(win_size, device=y.device) + stft_spec = torch.stft( + y, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center, + pad_mode='reflect', + normalized=False, + return_complex=True) + stft_spec = torch.view_as_real(stft_spec) + mag = torch.sqrt(stft_spec.pow(2).sum(-1) + (1e-9)) + pha = torch.atan2(stft_spec[:, :, :, 1], stft_spec[:, :, :, 0] + (1e-5)) + # Magnitude Compression + mag = torch.pow(mag, compress_factor) + com = torch.stack((mag * torch.cos(pha), mag * torch.sin(pha)), dim=-1) + + return mag, pha, com + + +def mag_pha_istft(mag, + pha, + n_fft, + hop_size, + win_size, + compress_factor=1.0, + center=True): + # Magnitude Decompression + mag = torch.pow(mag, (1.0 / compress_factor)) + com = torch.complex(mag * torch.cos(pha), mag * torch.sin(pha)) + hann_window = torch.hann_window(win_size, device=com.device) + + wav = torch.istft( + com, + n_fft, + hop_length=hop_size, + win_length=win_size, + window=hann_window, + center=center) + return wav diff --git a/modelscope/models/audio/punc/__init__.py b/modelscope/models/audio/ans/zipenhancer_layers/__init__.py similarity index 100% rename from modelscope/models/audio/punc/__init__.py rename to modelscope/models/audio/ans/zipenhancer_layers/__init__.py diff --git a/modelscope/models/audio/ans/zipenhancer_layers/generator.py b/modelscope/models/audio/ans/zipenhancer_layers/generator.py new file mode 100644 index 000000000..8332ba4d8 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/generator.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. and its affiliates. 
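# --- Editor's note (illustrative sketch, not part of the patch) --------------
# A small sketch of the compressed STFT round trip that ZipenhancerDecorator
# performs around the network: RMS-normalise the waveform, analyse it with
# mag_pha_stft (compress_factor=0.3), then resynthesise with mag_pha_istft.
# It assumes the zipenhancer module added above is importable from a source
# tree that contains this patch; the input waveform here is random data.
import torch
from modelscope.models.audio.ans.zipenhancer import mag_pha_stft, mag_pha_istft

n_fft, hop_size, win_size = 400, 100, 400   # values hard-coded in forward()
wav = torch.randn(1, 16000)                 # one second of fake 16 kHz audio

# Same RMS normalisation as ZipenhancerDecorator.forward.
norm_factor = torch.sqrt(wav.shape[1] / torch.sum(wav ** 2.0))
mag, pha, com = mag_pha_stft(
    wav * norm_factor, n_fft, hop_size, win_size, compress_factor=0.3)
print(mag.shape, pha.shape)                 # (B, F, T) = [1, 201, 161] each

# The model consumes (mag, pha) and predicts denoised (amp_g, pha_g); here we
# simply invert the analysis to show the reconstruction path.
recon = mag_pha_istft(mag, pha, n_fft, hop_size, win_size, compress_factor=0.3)
recon = recon / norm_factor                 # undo the RMS normalisation
print(recon.shape)                          # [1, 16000]
# ------------------------------------------------------------------------------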
+# Part of the implementation is borrowed and modified from MP-SENet, +# public available at https://github.com/yxlu-0102/MP-SENet + +import random + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SubPixelConvTranspose2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=(1, 3), + stride=(1, 2), + padding=(0, 1)): + super(SubPixelConvTranspose2d, self).__init__() + self.upscale_width_factor = stride[1] + self.conv1 = nn.Conv2d( + in_channels, + out_channels * self.upscale_width_factor, + kernel_size=kernel_size, + padding=padding) # only change the width + + def forward(self, x): + + b, c, t, f = x.size() + # Use conv1 for upsampling, followed by expansion only in the width dimension. + x = self.conv1(x) + # print(x.size()) + # Note: Here we do not directly use PixelShuffle because we only intend to expand in the width dimension, + # whereas PixelShuffle operates simultaneously on both height and width, hence we manually adjust accordingly. + # b, 2c, t, f + # print(x.size()) + x = x.view(b, c, self.upscale_width_factor, t, + f).permute(0, 1, 3, 4, 2).contiguous() + # b, c, 2, t, f -> b, c, t, f, 2 + x = x.view(b, c, t, f * self.upscale_width_factor) + # b, c, t, 2f = 202 + # x = nn.functional.pad(x, (0, 1)) + # b, c, t, 2f = 202 + + return x + + +class DenseBlockV2(nn.Module): + """ + A denseblock for ZipEnhancer + """ + + def __init__(self, h, kernel_size=(2, 3), depth=4): + super(DenseBlockV2, self).__init__() + self.h = h + self.depth = depth + self.dense_block = nn.ModuleList([]) + for i in range(depth): + dil = 2**i + pad_length = kernel_size[0] + (dil - 1) * (kernel_size[0] - 1) - 1 + dense_conv = nn.Sequential( + nn.ConstantPad2d((1, 1, pad_length, 0), value=0.), + nn.Conv2d( + h.dense_channel * (i + 1), + h.dense_channel, + kernel_size, + dilation=(dil, 1)), + # nn.Conv2d(h.dense_channel * (i + 1), h.dense_channel, kernel_size, dilation=(dil, 1), + # padding=get_padding_2d(kernel_size, (dil, 1))), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + self.dense_block.append(dense_conv) + + def forward(self, x): + skip = x + # b, c, t, f + for i in range(self.depth): + _x = skip + x = self.dense_block[i](_x) + # print(x.size()) + skip = torch.cat([x, skip], dim=1) + return x + + +class DenseEncoder(nn.Module): + + def __init__(self, h, in_channel): + """ + Initialize the DenseEncoder module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + in_channel (int): Number of input channels. Example: mag + phase: 2 channels + """ + super(DenseEncoder, self).__init__() + self.h = h + self.dense_conv_1 = nn.Sequential( + nn.Conv2d(in_channel, h.dense_channel, (1, 1)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + + self.dense_block = DenseBlockV2(h, depth=4) + + encoder_pad_kersize = (0, 1) + # Here pad was originally (0, 0),now change to (0, 1) + self.dense_conv_2 = nn.Sequential( + nn.Conv2d( + h.dense_channel, + h.dense_channel, (1, 3), (1, 2), + padding=encoder_pad_kersize), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + + def forward(self, x): + """ + Forward pass of the DenseEncoder module. + + Args: + x (Tensor): Input tensor of shape [B, C=in_channel, T, F]. + + Returns: + Tensor: Output tensor after passing through the dense encoder. Maybe: [B, C=dense_channel, T, F // 2]. 
+ """ + # print("x: {}".format(x.size())) + x = self.dense_conv_1(x) # [b, 64, T, F] + if self.dense_block is not None: + x = self.dense_block(x) # [b, 64, T, F] + x = self.dense_conv_2(x) # [b, 64, T, F//2] + return x + + +class BaseDecoder(nn.Module): + + def __init__(self, h): + """ + Initialize the BaseDecoder module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + including upsample_type, dense_block_type. + """ + super(BaseDecoder, self).__init__() + + self.upsample_module_class = SubPixelConvTranspose2d + + # for both mag and phase decoder + self.dense_block = DenseBlockV2(h, depth=4) + + +class MappingDecoder(BaseDecoder): + + def __init__(self, h, out_channel=1): + """ + Initialize the MappingDecoderV3 module. + + Args: + h (object): Configuration object containing various hyperparameters and settings. + out_channel (int): Number of output channels. Default is 1. The number of output spearkers. + """ + super(MappingDecoder, self).__init__(h) + decoder_final_kersize = (1, 2) + + self.mask_conv = nn.Sequential( + self.upsample_module_class(h.dense_channel, h.dense_channel, + (1, 3), (1, 2)), + # nn.Conv2d(h.dense_channel, out_channel, (1, 1)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel), + nn.Conv2d(h.dense_channel, out_channel, decoder_final_kersize)) + # Upsample at F dimension + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """ + Forward pass of the MappingDecoderV3 module. + + Args: + x (Tensor): Input tensor. [B, C, T, F] + + Returns: + Tensor: Output tensor after passing through the decoder. [B, Num_Spks, T, F] + """ + if self.dense_block is not None: + x = self.dense_block(x) + x = self.mask_conv(x) + x = self.relu(x) + # b, c=1, t, f + return x + + +class PhaseDecoder(BaseDecoder): + + def __init__(self, h, out_channel=1): + super(PhaseDecoder, self).__init__(h) + + # now change to (1, 2), previous (1, 1) + decoder_final_kersize = (1, 2) + + self.phase_conv = nn.Sequential( + self.upsample_module_class(h.dense_channel, h.dense_channel, + (1, 3), (1, 2)), + nn.InstanceNorm2d(h.dense_channel, affine=True), + nn.PReLU(h.dense_channel)) + self.phase_conv_r = nn.Conv2d(h.dense_channel, out_channel, + decoder_final_kersize) + self.phase_conv_i = nn.Conv2d(h.dense_channel, out_channel, + decoder_final_kersize) + + def forward(self, x): + if self.dense_block is not None: + x = self.dense_block(x) + x = self.phase_conv(x) + x_r = self.phase_conv_r(x) + x_i = self.phase_conv_i(x) + x = torch.atan2(x_i, x_r) + return x diff --git a/modelscope/models/audio/ans/zipenhancer_layers/scaling.py b/modelscope/models/audio/ans/zipenhancer_layers/scaling.py new file mode 100644 index 000000000..06dfc2bd6 --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/scaling.py @@ -0,0 +1,1055 @@ +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey) +# Copyright (c) 2024 Alibaba, Inc. and its affiliates. +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +import math +import random +from typing import Optional, Tuple, Union + +# import k2 +import torch +import torch.nn as nn +from torch import Tensor +from torch.cuda.amp import custom_bwd, custom_fwd + + +def logaddexp_onnx(x: Tensor, y: Tensor) -> Tensor: + max_value = torch.max(x, y) + diff = torch.abs(x - y) + return max_value + torch.log1p(torch.exp(-diff)) + + +# RuntimeError: Exporting the operator logaddexp to ONNX opset version +# 14 is not supported. Please feel free to request support or submit +# a pull request on PyTorch GitHub. +# +# The following function is to solve the above error when exporting +# models to ONNX via torch.jit.trace() +def logaddexp(x: Tensor, y: Tensor) -> Tensor: + # Caution(fangjun): Put torch.jit.is_scripting() before + # torch.onnx.is_in_onnx_export(); + # otherwise, it will cause errors for torch.jit.script(). + # + # torch.logaddexp() works for both torch.jit.script() and + # torch.jit.trace() but it causes errors for ONNX export. + # + if torch.jit.is_scripting(): + # Note: We cannot use torch.jit.is_tracing() here as it also + # matches torch.onnx.export(). + return torch.logaddexp(x, y) + elif torch.onnx.is_in_onnx_export(): + return logaddexp_onnx(x, y) + else: + # for torch.jit.trace() + return torch.logaddexp(x, y) + + +class PiecewiseLinear(object): + """ + Piecewise linear function, from float to float, specified as nonempty list of (x,y) pairs with + the x values in order. x values <[initial x] or >[final x] are map to [initial y], [final y] + respectively. + """ + + def __init__(self, *args): + assert len(args) >= 1, len(args) + if len(args) == 1 and isinstance(args[0], PiecewiseLinear): + self.pairs = list(args[0].pairs) + else: + self.pairs = [(float(x), float(y)) for x, y in args] + for x, y in self.pairs: + assert isinstance(x, (float, int)), type(x) + assert isinstance(y, (float, int)), type(y) + + for i in range(len(self.pairs) - 1): + assert self.pairs[i + 1][0] > self.pairs[i][0], ( + i, + self.pairs[i], + self.pairs[i + 1], + ) + + def __str__(self): + # e.g. 
'PiecewiseLinear((0., 10.), (100., 0.))' + return f'PiecewiseLinear({str(self.pairs)[1:-1]})' + + def __call__(self, x): + if x <= self.pairs[0][0]: + return self.pairs[0][1] + elif x >= self.pairs[-1][0]: + return self.pairs[-1][1] + else: + cur_x, cur_y = self.pairs[0] + for i in range(1, len(self.pairs)): + next_x, next_y = self.pairs[i] + if x >= cur_x and x <= next_x: + return cur_y + (next_y - cur_y) * (x - cur_x) / ( + next_x - cur_x) + cur_x, cur_y = next_x, next_y + assert False + + def __mul__(self, alpha): + return PiecewiseLinear(*[(x, y * alpha) for x, y in self.pairs]) + + def __add__(self, x): + if isinstance(x, (float, int)): + return PiecewiseLinear(*[(p[0], p[1] + x) for p in self.pairs]) + s, x = self.get_common_basis(x) + return PiecewiseLinear(*[(sp[0], sp[1] + xp[1]) + for sp, xp in zip(s.pairs, x.pairs)]) + + def max(self, x): + if isinstance(x, (float, int)): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear(*[(sp[0], max(sp[1], xp[1])) + for sp, xp in zip(s.pairs, x.pairs)]) + + def min(self, x): + if isinstance(x, float) or isinstance(x, int): + x = PiecewiseLinear((0, x)) + s, x = self.get_common_basis(x, include_crossings=True) + return PiecewiseLinear(*[(sp[0], min(sp[1], xp[1])) + for sp, xp in zip(s.pairs, x.pairs)]) + + def __eq__(self, other): + return self.pairs == other.pairs + + def get_common_basis(self, + p: 'PiecewiseLinear', + include_crossings: bool = False): + """ + Returns (self_mod, p_mod) which are equivalent piecewise linear + functions to self and p, but with the same x values. + + p: the other piecewise linear function + include_crossings: if true, include in the x values positions + where the functions indicate by this and p cross. + """ + assert isinstance(p, PiecewiseLinear), type(p) + + # get sorted x-values without repetition. + x_vals = sorted( + set([x for x, _ in self.pairs] + [x for x, _ in p.pairs])) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + + if include_crossings: + extra_x_vals = [] + for i in range(len(x_vals) - 1): + _compare_results1 = (y_vals1[i] > y_vals2[i]) + _compare_results2 = (y_vals1[i + 1] > y_vals2[i + 1]) + if _compare_results1 != _compare_results2: + # if ((y_vals1[i] > y_vals2[i]) != + # (y_vals1[i + 1] > y_vals2[i + 1])): + # if the two lines in this subsegment potentially cross each other. + diff_cur = abs(y_vals1[i] - y_vals2[i]) + diff_next = abs(y_vals1[i + 1] - y_vals2[i + 1]) + # `pos`, between 0 and 1, gives the relative x position, + # with 0 being x_vals[i] and 1 being x_vals[i+1]. + pos = diff_cur / (diff_cur + diff_next) + extra_x_val = x_vals[i] + pos * (x_vals[i + 1] - x_vals[i]) + extra_x_vals.append(extra_x_val) + if len(extra_x_vals) > 0: + x_vals = sorted(set(x_vals + extra_x_vals)) + y_vals1 = [self(x) for x in x_vals] + y_vals2 = [p(x) for x in x_vals] + return ( + PiecewiseLinear(*zip(x_vals, y_vals1)), + PiecewiseLinear(*zip(x_vals, y_vals2)), + ) + + +class ScheduledFloat(torch.nn.Module): + """ + This object is a torch.nn.Module only because we want it to show up in [top_level module].modules(); + it does not have a working forward() function. You are supposed to cast it to float, as + in, float(parent_module.whatever), and use it as something like a dropout prob. + + It is a floating point value whose value changes depending on the batch count of the + training loop. It is a piecewise linear function where you specify the (x,y) pairs + in sorted order on x; x corresponds to the batch index. 
For batch-index values before the + first x or after the last x, we just use the first or last y value. + + Example: + self.dropout = ScheduledFloat((0.0, 0.2), (4000.0, 0.0), default=0.0) + + `default` is used when self.batch_count is not set or not in training mode or in + torch.jit scripting mode. + """ + + def __init__(self, *args, default: float = 0.0): + super().__init__() + # self.batch_count and self.name will be written to in the training loop. + self.batch_count = None + self.name = None + self.default = default + self.schedule = PiecewiseLinear(*args) + + def extra_repr(self) -> str: + return ( + f'batch_count={self.batch_count}, schedule={str(self.schedule.pairs[1:-1])}' + ) + + def __float__(self): + batch_count = self.batch_count + if (batch_count is None or not self.training + or torch.jit.is_scripting() or torch.jit.is_tracing()): + return float(self.default) + else: + ans = self.schedule(self.batch_count) + if random.random() < 0.0002: + logging.info( + f'ScheduledFloat: name={self.name}, batch_count={self.batch_count}, ans={ans}' + ) + return ans + + def __add__(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule + x, default=self.default) + else: + return ScheduledFloat( + self.schedule + x.schedule, default=self.default + x.default) + + def max(self, x): + if isinstance(x, float) or isinstance(x, int): + return ScheduledFloat(self.schedule.max(x), default=self.default) + else: + return ScheduledFloat( + self.schedule.max(x.schedule), + default=max(self.default, x.default)) + + +FloatLike = Union[float, ScheduledFloat] + + +class SoftmaxFunction(torch.autograd.Function): + """ + Tries to handle half-precision derivatives in a randomized way that should + be more accurate for training than the default behavior. + """ + + @staticmethod + def forward(ctx, x: Tensor, dim: int): + ans = x.softmax(dim=dim) + # if x dtype is float16, x.softmax() returns a float32 because + # (presumably) that op does not support float16, and autocast + # is enabled. + if torch.is_autocast_enabled(): + ans = ans.to(torch.float16) + ctx.save_for_backward(ans) + ctx.x_dtype = x.dtype + ctx.dim = dim + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor): + (ans, ) = ctx.saved_tensors + with torch.cuda.amp.autocast(enabled=False): + ans_grad = ans_grad.to(torch.float32) + ans = ans.to(torch.float32) + x_grad = ans_grad * ans + x_grad = x_grad - ans * x_grad.sum(dim=ctx.dim, keepdim=True) + return x_grad, None + + +def inplace_softmax(tensor, dim): + # Subtract the maximum value from each Tensor to prevent overflow. + max_vals, _ = tensor.max(dim=dim, keepdim=True) + tensor.sub_(max_vals) + + # # calculate logsumexp + # log_sum_exp = torch.logsumexp(tensor, dim=dim, keepdim=True) + # + # # minus logsumexp + # tensor.sub_(log_sum_exp) + # + # # Compute the exponential of each element, and store the results in-place. + # tensor.exp_() + + # Compute the exponential of each element, and store the results in-place. + tensor.exp_() + + # Compute the sum along the specified dimension, and store the result in-place. + sum_exp = tensor.sum(dim=dim, keepdim=True) + + # Divide each element by the sum along that dimension, and store the result in-place. 
+ tensor.div_(sum_exp) + # tensor.add_(1e-8) + + return tensor + + +def softmax(x: Tensor, dim: int): + if not x.requires_grad or torch.jit.is_scripting() or torch.jit.is_tracing( + ): + # return x.softmax(dim=dim) + # inplace operator + return inplace_softmax(x, dim) + + return SoftmaxFunction.apply(x, dim) + + +class BiasNormFunction(torch.autograd.Function): + # This computes: + # scales = (torch.mean((x - bias) ** 2, keepdim=True)) ** -0.5 * log_scale.exp() + # return x * scales + # (after unsqueezing the bias), but it does it in a memory-efficient way so that + # it can just store the returned value (chances are, this will also be needed for + # some other reason, related to the next operation, so we can save memory). + @staticmethod + def forward( + ctx, + x: Tensor, + bias: Tensor, + log_scale: Tensor, + channel_dim: int, + store_output_for_backprop: bool, + ) -> Tensor: + assert bias.ndim == 1 + if channel_dim < 0: + channel_dim = channel_dim + x.ndim + ctx.store_output_for_backprop = store_output_for_backprop + ctx.channel_dim = channel_dim + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + _x_bias_square = torch.mean( + (x - bias)**2, dim=channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * log_scale.exp() + ans = x * scales + ctx.save_for_backward( + ans.detach() if store_output_for_backprop else x, + scales.detach(), + bias.detach(), + log_scale.detach(), + ) + return ans + + @staticmethod + def backward(ctx, ans_grad: Tensor) -> Tensor: + ans_or_x, scales, bias, log_scale = ctx.saved_tensors + if ctx.store_output_for_backprop: + x = ans_or_x / scales + else: + x = ans_or_x + x = x.detach() + x.requires_grad = True + bias.requires_grad = True + log_scale.requires_grad = True + with torch.enable_grad(): + # recompute scales from x, bias and log_scale. + _x_bias_square = torch.mean( + (x - bias)**2, dim=ctx.channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * log_scale.exp() + ans = x * scales + ans.backward(gradient=ans_grad) + return x.grad, bias.grad.flatten(), log_scale.grad, None, None + + +class BiasNorm(torch.nn.Module): + """ + This is intended to be a simpler, and hopefully cheaper, replacement for + LayerNorm. The observation this is based on, is that Transformer-type + networks, especially with pre-norm, sometimes seem to set one of the + feature dimensions to a large constant value (e.g. 50), which "defeats" + the LayerNorm because the output magnitude is then not strongly dependent + on the other (useful) features. Presumably the weight and bias of the + LayerNorm are required to allow it to do this. + + Instead, we give the BiasNorm a trainable bias that it can use when + computing the scale for normalization. We also give it a (scalar) + trainable scale on the output. + + + Args: + num_channels: the number of channels, e.g. 512. + channel_dim: the axis/dimension corresponding to the channel, + interpreted as an offset from the input's ndim if negative. + This is NOT the num_channels; it should typically be one of + {-2, -1, 0, 1, 2, 3}. + log_scale: the initial log-scale that we multiply the output by; this + is learnable. + log_scale_min: FloatLike, minimum allowed value of log_scale + log_scale_max: FloatLike, maximum allowed value of log_scale + store_output_for_backprop: only possibly affects memory use; recommend + to set to True if you think the output of this module is more likely + than the input of this module to be required to be stored for the + backprop. 
+ """ + + def __init__( + self, + num_channels: int, + channel_dim: int = -1, # CAUTION: see documentation. + log_scale: float = 1.0, + log_scale_min: float = -1.5, + log_scale_max: float = 1.5, + store_output_for_backprop: bool = False, + ) -> None: + super(BiasNorm, self).__init__() + self.num_channels = num_channels + self.channel_dim = channel_dim + self.log_scale = nn.Parameter(torch.tensor(log_scale)) + self.bias = nn.Parameter( + torch.empty(num_channels).normal_(mean=0, std=1e-4)) + + self.log_scale_min = log_scale_min + self.log_scale_max = log_scale_max + + self.store_output_for_backprop = store_output_for_backprop + + def forward(self, x: Tensor) -> Tensor: + assert x.shape[self.channel_dim] == self.num_channels + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + channel_dim = self.channel_dim + if channel_dim < 0: + channel_dim += x.ndim + bias = self.bias + for _ in range(channel_dim + 1, x.ndim): + bias = bias.unsqueeze(-1) + _x_bias_square = torch.mean( + (x - bias)**2, dim=channel_dim, keepdim=True) + scales = (_x_bias_square**-0.5) * self.log_scale.exp() + return x * scales + + log_scale = limit_param_value( + self.log_scale, + min=float(self.log_scale_min), + max=float(self.log_scale_max), + training=self.training, + ) + + return BiasNormFunction.apply(x, self.bias, log_scale, + self.channel_dim, + self.store_output_for_backprop) + + +def ScaledLinear(*args, initial_scale: float = 1.0, **kwargs) -> nn.Linear: + """ + Behaves like a constructor of a modified version of nn.Linear + that gives an easy way to set the default initial parameter scale. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + ans = nn.Linear(*args, **kwargs) + with torch.no_grad(): + ans.weight[:] *= initial_scale + if ans.bias is not None: + torch.nn.init.uniform_(ans.bias, -0.1 * initial_scale, + 0.1 * initial_scale) + return ans + + +class ChunkCausalDepthwiseConv1d(torch.nn.Module): + """ + Behaves like a depthwise 1d convolution, except that it is causal in + a chunkwise way, as if we had a block-triangular attention mask. + The chunk size is provided at test time (it should probably be + kept in sync with the attention mask). + + This has a little more than twice the parameters of a conventional + depthwise conv1d module: we implement it by having one + depthwise convolution, of half the width, that is causal (via + right-padding); and one depthwise convolution that is applied only + within chunks, that we multiply by a scaling factor which depends + on the position within the chunk. + + Args: + Accepts the standard args and kwargs that nn.Linear accepts + e.g. in_features, out_features, bias=False. + + initial_scale: you can override this if you want to increase + or decrease the initial magnitude of the module's output + (affects the initialization of weight_scale and bias_scale). + Another option, if you want to do something like this, is + to re-initialize the parameters. + """ + + def __init__( + self, + channels: int, + kernel_size: int, + initial_scale: float = 1.0, + bias: bool = True, + ): + super().__init__() + assert kernel_size % 2 == 1 + + half_kernel_size = (kernel_size + 1) // 2 + # will pad manually, on one side. 
+ self.causal_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=half_kernel_size, + padding=0, + bias=True, + ) + + self.chunkwise_conv = nn.Conv1d( + in_channels=channels, + out_channels=channels, + groups=channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + bias=bias, + ) + + # first row is correction factors added to the scale near the left edge of the chunk, + # second row is correction factors added to the scale near the right edge of the chunk, + # both of these are added to a default scale of 1.0. + self.chunkwise_conv_scale = nn.Parameter( + torch.zeros(2, channels, kernel_size)) + self.kernel_size = kernel_size + + with torch.no_grad(): + self.causal_conv.weight[:] *= initial_scale + self.chunkwise_conv.weight[:] *= initial_scale + if bias: + torch.nn.init.uniform_(self.causal_conv.bias, + -0.1 * initial_scale, + 0.1 * initial_scale) + + def forward(self, x: Tensor, chunk_size: int = -1) -> Tensor: + """Forward function. + + Args: + x: a Tensor of shape (batch_size, channels, seq_len) + chunk_size: the chunk size, in frames; does not have to divide seq_len exactly. + """ + (batch_size, num_channels, seq_len) = x.shape + + # half_kernel_size = self.kernel_size + 1 // 2 + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. It's the amount by which we must pad on the left, + # to make the convolution causal. + left_pad = self.kernel_size // 2 + + if chunk_size < 0 or chunk_size > seq_len: + chunk_size = seq_len + right_pad = -seq_len % chunk_size + + x = torch.nn.functional.pad(x, (left_pad, right_pad)) + + x_causal = self.causal_conv(x[..., :left_pad + seq_len]) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + num_chunks = x_chunk.shape[2] // chunk_size + x_chunk = x_chunk.reshape(batch_size, num_channels, num_chunks, + chunk_size) + x_chunk = x_chunk.permute(0, 2, 1, 3).reshape(batch_size * num_chunks, + num_channels, chunk_size) + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size) + + x_chunk = x_chunk * chunk_scale + x_chunk = x_chunk.reshape(batch_size, num_chunks, num_channels, + chunk_size).permute(0, 2, 1, 3) + x_chunk = x_chunk.reshape(batch_size, num_channels, + num_chunks * chunk_size)[..., :seq_len] + + return x_chunk + x_causal + + def _get_chunk_scale(self, chunk_size: int): + """Returns tensor of shape (num_channels, chunk_size) that will be used to + scale the output of self.chunkwise_conv.""" + left_edge = self.chunkwise_conv_scale[0] + right_edge = self.chunkwise_conv_scale[1] + if chunk_size < self.kernel_size: + left_edge = left_edge[:, :chunk_size] + right_edge = right_edge[:, -chunk_size:] + else: + t = chunk_size - self.kernel_size + channels = left_edge.shape[0] + pad = torch.zeros( + channels, t, device=left_edge.device, dtype=left_edge.dtype) + left_edge = torch.cat((left_edge, pad), dim=-1) + right_edge = torch.cat((pad, right_edge), dim=-1) + return 1.0 + (left_edge + right_edge) + + def streaming_forward( + self, + x: Tensor, + cache: Tensor, + ) -> Tuple[Tensor, Tensor]: + """Streaming Forward function. + + Args: + x: a Tensor of shape (batch_size, channels, seq_len) + cache: cached left context of shape (batch_size, channels, left_pad) + """ + (batch_size, num_channels, seq_len) = x.shape + + # left_pad is half_kernel_size - 1 where half_kernel_size is the size used + # in the causal conv. 
It's the amount by which we must pad on the left, + # to make the convolution causal. + left_pad = self.kernel_size // 2 + + # Pad cache + assert cache.shape[-1] == left_pad, (cache.shape[-1], left_pad) + x = torch.cat([cache, x], dim=2) + # Update cache + cache = x[..., -left_pad:] + + x_causal = self.causal_conv(x) + assert x_causal.shape == (batch_size, num_channels, seq_len) + + x_chunk = x[..., left_pad:] + x_chunk = self.chunkwise_conv(x_chunk) # does not change shape + + chunk_scale = self._get_chunk_scale(chunk_size=seq_len) + x_chunk = x_chunk * chunk_scale + + return x_chunk + x_causal, cache + + +def penalize_abs_values_gt(x: Tensor, + limit: float, + penalty: float, + name: str = None) -> Tensor: + """ + Returns x unmodified, but in backprop will put a penalty for the excess of + the absolute values of elements of x over the limit "limit". E.g. if + limit == 10.0, then if x has any values over 10 it will get a penalty. + + Caution: the value of this penalty will be affected by grad scaling used + in automatic mixed precision training. For this reasons we use this, + it shouldn't really matter, or may even be helpful; we just use this + to disallow really implausible values of scores to be given to softmax. + + The name is for randomly printed debug info. + """ + x_sign = x.sign() + over_limit = (x.abs() - limit) > 0 + # The following is a memory efficient way to penalize the absolute values of + # x that's over the limit. (The memory efficiency comes when you think + # about which items torch needs to cache for the autograd, and which ones it + # can throw away). The numerical value of aux_loss as computed here will + # actually be larger than it should be, by limit * over_limit.sum(), but it + # has the same derivative as the real aux_loss which is penalty * (x.abs() - + # limit).relu(). + aux_loss = penalty * ((x_sign * over_limit).to(torch.int8) * x) + # note: we don't do sum() here on aux)_loss, but it's as if we had done + # sum() due to how with_loss() works. + x = with_loss(x, aux_loss, name) + # you must use x for something, or this will be ineffective. + return x + + +class WithLoss(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: Tensor, y: Tensor, name: str): + ctx.y_shape = y.shape + if random.random() < 0.002 and name is not None: + loss_sum = y.sum().item() + logging.info(f'WithLoss: name={name}, loss-sum={loss_sum:.3e}') + return x + + @staticmethod + def backward(ctx, ans_grad: Tensor): + return ( + ans_grad, + torch.ones( + ctx.y_shape, dtype=ans_grad.dtype, device=ans_grad.device), + None, + ) + + +def with_loss(x, y, name): + # returns x but adds y.sum() to the loss function. + return WithLoss.apply(x, y, name) + + +class LimitParamValue(torch.autograd.Function): + + @staticmethod + def forward(ctx, x: Tensor, min: float, max: float): + ctx.save_for_backward(x) + assert max >= min + ctx.min = min + ctx.max = max + return x + + @staticmethod + def backward(ctx, x_grad: Tensor): + (x, ) = ctx.saved_tensors + # where x < ctx.min, ensure all grads are negative (this will tend to make + # x more positive). + x_grad = x_grad * torch.where( + torch.logical_and(x_grad > 0, x < ctx.min), -1.0, 1.0) + # where x > ctx.max, ensure all grads are positive (this will tend to make + # x more negative). 
+ x_grad *= torch.where( + torch.logical_and(x_grad < 0, x > ctx.max), -1.0, 1.0) + return x_grad, None, None + + +def limit_param_value(x: Tensor, + min: float, + max: float, + prob: float = 0.6, + training: bool = True): + # You apply this to (typically) an nn.Parameter during training to ensure that its + # (elements mostly) stays within a supplied range. This is done by modifying the + # gradients in backprop. + # It's not necessary to do this on every batch: do it only some of the time, + # to save a little time. + if training and random.random() < prob: + return LimitParamValue.apply(x, min, max) + else: + return x + + +def _no_op(x: Tensor) -> Tensor: + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return x + else: + # a no-op function that will have a node in the autograd graph, + # to avoid certain bugs relating to backward hooks + return x.chunk(1, dim=-1)[0] + + +class Identity(torch.nn.Module): + + def __init__(self): + super(Identity, self).__init__() + + def forward(self, x): + return _no_op(x) + + +# Dropout2 is just like normal dropout, except it supports schedules on the dropout rates. +class Dropout2(nn.Module): + + def __init__(self, p: FloatLike): + super().__init__() + self.p = p + + def forward(self, x: Tensor) -> Tensor: + return torch.nn.functional.dropout( + x, p=float(self.p), training=self.training) + + +class SwooshLFunction(torch.autograd.Function): + """ + swoosh_l(x) = log(1 + exp(x-4)) - 0.08*x - 0.035 + """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + coeff = -0.08 + + with (torch.cuda.amp.autocast(enabled=False)): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 4.0) + coeff * x - 0.035 + + if not requires_grad: + return y + + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = coeff + ceil = 1.0 + coeff + 0.005 + _diff = (grad - floor) * (255.0 / (ceil - floor)) + d_scaled = _diff + torch.rand_like(grad) + if __name__ == '__main__': + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d, ) = ctx.saved_tensors + # the same constants as used in forward pass. + + coeff = -0.08 + floor = coeff + ceil = 1.0 + coeff + 0.005 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshL(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 4.0) - 0.08 * x - 0.035 + # if not x.requires_grad: + # return k2.swoosh_l_forward(x) + # else: + # return k2.swoosh_l(x) + return SwooshLFunction.apply(x) + + +class SwooshLOnnx(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-L activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 4.0) - 0.08 * x - 0.035 + + +class SwooshRFunction(torch.autograd.Function): + """ + swoosh_r(x) = log(1 + exp(x-1)) - 0.08*x - 0.313261687 + + derivatives are between -0.08 and 0.92. 
+ """ + + @staticmethod + def forward(ctx, x: Tensor) -> Tensor: + requires_grad = x.requires_grad + + if x.dtype == torch.float16: + x = x.to(torch.float32) + + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + + with torch.cuda.amp.autocast(enabled=False): + with torch.enable_grad(): + x = x.detach() + x.requires_grad = True + y = torch.logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + + if not requires_grad: + return y + y.backward(gradient=torch.ones_like(y)) + + grad = x.grad + floor = -0.08 + ceil = 0.925 + + _diff = (grad - floor) * (255.0 / (ceil - floor)) + d_scaled = _diff + torch.rand_like(grad) + if __name__ == '__main__': + # for self-testing only. + assert d_scaled.min() >= 0.0 + assert d_scaled.max() < 256.0 + + d_int = d_scaled.to(torch.uint8) + ctx.save_for_backward(d_int) + if x.dtype == torch.float16 or torch.is_autocast_enabled(): + y = y.to(torch.float16) + return y + + @staticmethod + def backward(ctx, y_grad: Tensor) -> Tensor: + (d, ) = ctx.saved_tensors + # the same constants as used in forward pass. + floor = -0.08 + ceil = 0.925 + d = d * ((ceil - floor) / 255.0) + floor + return y_grad * d + + +class SwooshR(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + if torch.jit.is_scripting() or torch.jit.is_tracing(): + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp(zero, x - 1.0) - 0.08 * x - 0.313261687 + # if not x.requires_grad: + # return k2.swoosh_r_forward(x) + # else: + # return k2.swoosh_r(x) + return SwooshRFunction.apply(x) + + +class SwooshROnnx(torch.nn.Module): + + def forward(self, x: Tensor) -> Tensor: + """Return Swoosh-R activation.""" + zero = torch.tensor(0.0, dtype=x.dtype, device=x.device) + return logaddexp_onnx(zero, x - 1.0) - 0.08 * x - 0.313261687 + + +# simple version of SwooshL that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshLForward(x: Tensor): + x_offset = x - 4.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + return log_sum - 0.08 * x - 0.035 + + +def SwooshLForwardAndDeriv(x: Tensor): + """ + https://k2-fsa.github.io/k2/python_api/api.html#swoosh-l-forward-and-deriv + :param x: + :return: + """ + x_offset = x - 4.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + + deriv = 0.92 - 1 / (1 + x_offset.exp()) + + return log_sum - 0.08 * x - 0.035, deriv + + +# simple version of SwooshR that does not redefine the backprop, used in +# ActivationDropoutAndLinearFunction. +def SwooshRForward(x: Tensor): + x_offset = x - 1.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + return log_sum - 0.08 * x - 0.313261687 + + +def SwooshRForwardAndDeriv(x: Tensor): + """ + https://k2-fsa.github.io/k2/python_api/api.html#swoosh-r-forward-and-deriv + :param x: + :return: + """ + x_offset = x - 1.0 + log_sum = (1.0 + x_offset.exp()).log().to(x.dtype) + log_sum = torch.where(log_sum == float('inf'), x_offset, log_sum) + + deriv = 0.92 - 1 / (1 + x_offset.exp()) + + return log_sum - 0.08 * x - 0.313261687, deriv + + +class ActivationDropoutAndLinear(torch.nn.Module): + """ + This merges an activation function followed by dropout and then a nn.Linear module; + it does so in a memory efficient way so that it only stores the input to the whole + module. 
If activation == SwooshL and dropout_shared_dim != None, this will be + equivalent to: + nn.Sequential(SwooshL(), + Dropout3(dropout_p, shared_dim=dropout_shared_dim), + ScaledLinear(in_channels, out_channels, bias=bias, + initial_scale=initial_scale)) + If dropout_shared_dim is None, the dropout would be equivalent to + Dropout2(dropout_p). Note: Dropout3 will be more memory efficient as the dropout + mask is smaller. + + Args: + in_channels: number of input channels, e.g. 256 + out_channels: number of output channels, e.g. 256 + bias: if true, have a bias + activation: the activation function, for now just support SwooshL. + dropout_p: the dropout probability or schedule (happens after nonlinearity). + dropout_shared_dim: the dimension, if any, across which the dropout mask is + shared (e.g. the time dimension). If None, this may be less memory + efficient if there are modules before this one that cache the input + for their backprop (e.g. Balancer or Whiten). + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + bias: bool = True, + activation: str = 'SwooshL', + dropout_p: FloatLike = 0.0, + dropout_shared_dim: Optional[int] = -1, + initial_scale: float = 1.0, + ): + super().__init__() + # create a temporary module of nn.Linear that we'll steal the + # weights and bias from + linear_module = ScaledLinear( + in_channels, out_channels, bias=bias, initial_scale=initial_scale) + + self.weight = linear_module.weight + # register_parameter properly handles making it a parameter when l.bias + # is None. I think there is some reason for doing it this way rather + # than just setting it to None but I don't know what it is, maybe + # something to do with exporting the module.. + self.register_parameter('bias', linear_module.bias) + + self.activation = activation + self.dropout_p = dropout_p + self.dropout_shared_dim = dropout_shared_dim + + def forward(self, x: Tensor): + # if torch.jit.is_scripting() or torch.jit.is_tracing(): + if torch.jit.is_scripting() or torch.jit.is_tracing() or ( + not self.training): + if self.activation == 'SwooshL': + x = SwooshLForward(x) + # x = k2.swoosh_l_forward(x) + elif self.activation == 'SwooshR': + x = SwooshRForward(x) + # x = k2.swoosh_r_forward(x) + else: + assert False, self.activation + return torch.nn.functional.linear(x, self.weight, self.bias) + + # print(f"dropout_p:{float(self.dropout_p)}") + # print(f"dropout_shared_dim:{self.dropout_shared_dim}") + # return ActivationDropoutAndLinearFunction.apply( + # x, + # self.weight, + # self.bias, + # self.activation, + # float(self.dropout_p), + # self.dropout_shared_dim, + # ) + + +def convert_num_channels(x: Tensor, num_channels: int) -> Tensor: + """ + + :param x: (b, c, t, f) + :param num_channels: + :return: x: (b, num_channels, t, f) + """ + if num_channels <= x.shape[1]: + return x[:, :num_channels, :, :] + else: + shape = list(x.shape) + shape[1] = num_channels - shape[1] + zeros = torch.zeros(shape, dtype=x.dtype, device=x.device) + return torch.cat((x, zeros), dim=1) + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py b/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py new file mode 100644 index 000000000..32bf7cb4d --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/zipenhancer_layer.py @@ -0,0 +1,501 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Alibaba, Inc. 
and its affiliates. + +import copy +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor, nn + +from .scaling import FloatLike, ScheduledFloat, convert_num_channels +from .zipformer import (BypassModule, CompactRelPositionalEncoding, + SimpleDownsample, SimpleUpsample, + Zipformer2EncoderLayer) + + +class DualPathZipformer2Encoder(nn.Module): + r"""DualPathZipformer2Encoder is a stack of N encoder layers + it has two kinds of EncoderLayer including F_Zipformer2EncoderLayer and T_Zipformer2EncoderLayer + the features are modeling with the shape of + [B, C, T, F] -> [F, T * B, C] -> -> [B, C, T, F] -> [T, F * B, C] -> [B, C, T, F] + + Args: + encoder_layer: an instance of the Zipformer2EncoderLayer() class (required). + num_layers: the number of sub-encoder-layers in the encoder (required). + pos_dim: the dimension for the relative positional encoding + + Examples:: + >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8) + >>> dualpath_zipformer_encoder = DualPathZipformer2Encoder(encoder_layer, num_layers=6) + >>> src = torch.rand(10, 512, 161, 101) + >>> out = dualpath_zipformer_encoder(src) + """ + + def __init__( + self, + encoder_layer: nn.Module, + num_layers: int, + pos_dim: int, + dropout: float, + warmup_begin: float, + warmup_end: float, + initial_layerdrop_rate: float = 0.5, + final_layerdrop_rate: float = 0.05, + bypass_layer=None, + ) -> None: + """ + Initialize the DualPathZipformer2Encoder module with the specified + encoder layer, number of layers, positional dimension, dropout rate, warmup period, and layer drop rates. + """ + super().__init__() + self.encoder_pos = CompactRelPositionalEncoding( + pos_dim, dropout_rate=0.15, length_factor=1.0) + + self.f_layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)]) + self.t_layers = nn.ModuleList( + [copy.deepcopy(encoder_layer) for i in range(num_layers)]) + self.bypass_layers = nn.ModuleList( + [bypass_layer for i in range(num_layers * 2)]) + self.num_layers = num_layers + + assert 0 <= warmup_begin <= warmup_end, (warmup_begin, warmup_end) + + delta = (1.0 / num_layers) * (warmup_end - warmup_begin) + cur_begin = warmup_begin # interpreted as a training batch index + for i in range(num_layers): + cur_end = cur_begin + delta + self.f_layers[i].bypass.skip_rate = ScheduledFloat( + (cur_begin, initial_layerdrop_rate), + (cur_end, final_layerdrop_rate), + default=0.0, + ) + self.t_layers[i].bypass.skip_rate = ScheduledFloat( + (cur_begin, initial_layerdrop_rate), + (cur_end, final_layerdrop_rate), + default=0.0, + ) + cur_begin = cur_end + + def forward( + self, + src: Tensor, + chunk_size: int = -1, + feature_mask: Union[Tensor, float] = 1.0, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Pass the input through the encoder layers in a dual-path manner, processing both temporal and frequency dimensions. + + Args: + src: the dual-path sequence to the encoder (required): + shape (batch_size, embedding_dim, seq_len, frequency_len). + chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking. No used. + feature_mask: something that broadcasts with src, that we'll multiply `src` + by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) + attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), + interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). + True means masked position. May be None. 
+ src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. + + Returns: a Tensor with the same shape as src. + """ + + # src: (b, c, t, f) + b, c, t, f = src.size() + src_f = src.permute(3, 0, 2, 1).contiguous().view(f, b * t, c) + src_t = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c) + pos_emb_f = self.encoder_pos(src_f) + pos_emb_t = self.encoder_pos(src_t) + + output = src + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + output = output * feature_mask + + for i in range(len(self.f_layers)): + # output_org = output + # (b, c, t, f) + output_f_org = output.permute(3, 2, 0, + 1).contiguous() # (f, t, b, c) + output_f = output_f_org.view(f, t * b, c) + # (f, t * b, c) + output_f = self.f_layers[i]( + output_f, + pos_emb_f, + # chunk_size=chunk_size, + # attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + output_f = output_f.view(f, t, b, c) + output_f = self.bypass_layers[i * 2](output_f_org, output_f) + + # (f, t, b, c) + output = output_f.permute(2, 3, 1, 0).contiguous() + # (b, c, t, f) + # output = self.bypass_layers[i * 2](output_org, output) + + # output_org = output + + output_t_org = output.permute(2, 3, 0, + 1).contiguous() # (t, f, b, c) + output_t = output_t_org.view(t, f * b, c) + output_t = self.t_layers[i]( + output_t, + pos_emb_t, + # chunk_size=chunk_size, + # attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + output_t = output_t.view(t, f, b, c) + output_t = self.bypass_layers[i * 2 + 1](output_t_org, output_t) + # (t, f, b, c) + + output = output_t.permute(2, 3, 0, 1).contiguous() + # (b, c, t, f) + # output = self.bypass_layers[i * 2 + 1](output_org, output) + + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + output = output * feature_mask + + return output + + +class DualPathDownsampledZipformer2Encoder(nn.Module): + r""" + DualPathDownsampledZipformer2Encoder is a dual-path zipformer encoder evaluated at a reduced frame rate, + after convolutional downsampling, and then upsampled again at the output, and combined + with the origin input, so that the output has the same shape as the input. + The features are downsampled-upsampled at the time and frequency domain. + + """ + + def __init__(self, encoder: nn.Module, dim: int, t_downsample: int, + f_downsample: int, dropout: FloatLike): + """ + Initialize the DualPathDownsampledZipformer2Encoder module with the specified + encoder, dimension, temporal and frequency downsampling factors r, and dropout rate. + """ + super(DualPathDownsampledZipformer2Encoder, self).__init__() + self.downsample_factor = t_downsample + self.t_downsample_factor = t_downsample + self.f_downsample_factor = f_downsample + + if self.t_downsample_factor != 1: + self.downsample_t = SimpleDownsample(dim, t_downsample, dropout) + self.upsample_t = SimpleUpsample(dim, t_downsample) + if self.f_downsample_factor != 1: + self.downsample_f = SimpleDownsample(dim, f_downsample, dropout) + self.upsample_f = SimpleUpsample(dim, f_downsample) + + # self.num_layers = encoder.num_layers + self.encoder = encoder + + self.out_combiner = BypassModule(dim, straight_through_rate=0) + + def forward( + self, + src: Tensor, + chunk_size: int = -1, + feature_mask: Union[Tensor, float] = 1.0, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + r"""Downsample the input, process through the encoder, and then upsample back to the original shape. 
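+
+        A sketch of the intended shape flow (sizes are illustrative only): with
+        t_downsample=2 and f_downsample=2, an input of shape (4, 64, 100, 80)
+        is reduced to roughly (4, 64, 50, 40) before the wrapped dual-path
+        encoder runs, then upsampled along frequency and time again and
+        combined with the original input, giving (4, 64, 100, 80) back.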
+ + Args: + src: the sequence to the encoder (required): shape (batch_size, embedding_dim, seq_len, frequency_len). + feature_mask: 1.0 + attn_mask: None + src_key_padding_mask: None. + + Returns: a Tensor with the same shape as src. (batch_size, embedding_dim, seq_len, frequency_len) + """ + # src: (b, c, t, f) + b, c, t, f = src.size() + # print(src.size()) + + src_orig = src.permute(2, 3, 0, 1) # (t, f, b, c) + + # (b, c, t, f) + src = src.permute(2, 0, 3, 1).contiguous().view(t, b * f, c) + # -> (t, b * f, c) + if self.t_downsample_factor != 1: + src = self.downsample_t(src) + # (t//ds + 1, b * f, c) + downsample_t = src.size(0) + src = src.view(downsample_t, b, f, + c).permute(2, 1, 0, + 3).contiguous().view(f, b * downsample_t, c) + # src = self.upsample_f(src) + if self.f_downsample_factor != 1: + src = self.downsample_f(src) + # (f//ds + 1, b * downsample_t, c) + downsample_f = src.size(0) + src = src.view(downsample_f, b, downsample_t, c).permute(1, 3, 2, 0) + # (b, c, downsample_t, downsample_f) + # print(src.size()) + + # ds = self.downsample_factor + # if attn_mask is not None: + # attn_mask = attn_mask[::ds, ::ds] + + src = self.encoder( + src, + chunk_size=chunk_size, + feature_mask=feature_mask, + attn_mask=attn_mask, + src_key_padding_mask=src_key_padding_mask, + ) + + # (b, c, downsample_t, downsample_f) + src = src.permute(3, 0, 2, + 1).contiguous().view(downsample_f, b * downsample_t, + c) + if self.f_downsample_factor != 1: + src = self.upsample_f(src) + # (f, b * downsample_t, c) + src = src[:f].view(f, b, downsample_t, + c).permute(2, 1, 0, 3).contiguous().view( + downsample_t, b * f, c) + # (downsample_t, b * f, c) + if self.t_downsample_factor != 1: + src = self.upsample_t(src) + # (t, b * f, c) + src = src[:t].view(t, b, f, c).permute(0, 2, 1, 3).contiguous() + # (t, f, b, c) + out = self.out_combiner(src_orig, src) + # (t, f, b, c) + + out = out.permute(2, 3, 0, 1).contiguous() + # (b, c, t, f) + # print(out.size()) + + # remove any extra frames that are not a multiple of downsample_factor + # src = src[: src_orig.shape[0]] # slice here + + return out + + +class Zipformer2DualPathEncoder(nn.Module): + + def __init__( + self, + output_downsampling_factor: int = 2, + downsampling_factor: Tuple[int] = (2, 4), + f_downsampling_factor: Tuple[int] = None, + encoder_dim: Union[int, Tuple[int]] = 384, + num_encoder_layers: Union[int, Tuple[int]] = 4, + encoder_unmasked_dim: Union[int, Tuple[int]] = 256, + query_head_dim: Union[int, Tuple[int]] = 24, + pos_head_dim: Union[int, Tuple[int]] = 4, + value_head_dim: Union[int, Tuple[int]] = 12, + num_heads: Union[int, Tuple[int]] = 8, + feedforward_dim: Union[int, Tuple[int]] = 1536, + cnn_module_kernel: Union[int, Tuple[int]] = 31, + pos_dim: int = 192, + dropout: FloatLike = None, # see code below for default + warmup_batches: float = 4000.0, + causal: bool = False, + chunk_size: Tuple[int] = [-1], + left_context_frames: Tuple[int] = [-1], + ): + """ + Initialize the Zipformer2DualPathEncoder module. + Zipformer2DualPathEncoder processes the hidden features of the noisy speech using dual-path modeling. + It has two kinds of blocks: DualPathZipformer2Encoder and DualPathDownsampledZipformer2Encoder. + DualPathZipformer2Encoder processes the 4D features with the shape of [B, C, T, F]. + DualPathDownsampledZipformer2Encoder first downsamples the hidden features + and processes features using dual-path modeling like DualPathZipformer2Encoder. + + Args: + Various hyperparameters and settings for the encoder. 
+ """ + super(Zipformer2DualPathEncoder, self).__init__() + + if dropout is None: + dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) + + def _to_tuple(x): + """Converts a single int or a 1-tuple of an int to a tuple with the same length + as downsampling_factor""" + if isinstance(x, int): + x = (x, ) + if len(x) == 1: + x = x * len(downsampling_factor) + else: + assert len(x) == len(downsampling_factor) and isinstance( + x[0], int) + return x + + self.output_downsampling_factor = output_downsampling_factor # int + self.downsampling_factor = downsampling_factor # tuple + + if f_downsampling_factor is None: + f_downsampling_factor = downsampling_factor + self.f_downsampling_factor = _to_tuple(f_downsampling_factor) + + self.encoder_dim = encoder_dim = _to_tuple(encoder_dim) # tuple + self.encoder_unmasked_dim = encoder_unmasked_dim = _to_tuple( + encoder_unmasked_dim) # tuple + num_encoder_layers = _to_tuple(num_encoder_layers) + self.num_encoder_layers = num_encoder_layers + self.query_head_dim = query_head_dim = _to_tuple(query_head_dim) + self.value_head_dim = value_head_dim = _to_tuple(value_head_dim) + pos_head_dim = _to_tuple(pos_head_dim) + self.num_heads = num_heads = _to_tuple(num_heads) + feedforward_dim = _to_tuple(feedforward_dim) + self.cnn_module_kernel = cnn_module_kernel = _to_tuple( + cnn_module_kernel) + + self.causal = causal + self.chunk_size = chunk_size + self.left_context_frames = left_context_frames + + for u, d in zip(encoder_unmasked_dim, encoder_dim): + assert u <= d + + # each one will be Zipformer2Encoder or DownsampledZipformer2Encoder + encoders = [] + + num_encoders = len(downsampling_factor) + # "1,2,4,8,4,2", + + for i in range(num_encoders): + encoder_layer = Zipformer2EncoderLayer( + embed_dim=encoder_dim[i], + pos_dim=pos_dim, + num_heads=num_heads[i], + query_head_dim=query_head_dim[i], + pos_head_dim=pos_head_dim[i], + value_head_dim=value_head_dim[i], + feedforward_dim=feedforward_dim[i], + dropout=dropout, + cnn_module_kernel=cnn_module_kernel[i], + causal=causal, + ) + + # For the segment of the warmup period, we let the Conv2dSubsampling + # layer learn something. Then we start to warm up the other encoders. + encoder = DualPathZipformer2Encoder( + encoder_layer, + num_encoder_layers[i], + pos_dim=pos_dim, + dropout=dropout, + warmup_begin=warmup_batches * (i + 1) / (num_encoders + 1), + warmup_end=warmup_batches * (i + 2) / (num_encoders + 1), + final_layerdrop_rate=0.035 * (downsampling_factor[i]**0.5), + bypass_layer=BypassModule( + encoder_dim[i], straight_through_rate=0), + ) + + if downsampling_factor[i] != 1 or f_downsampling_factor[i] != 1: + encoder = DualPathDownsampledZipformer2Encoder( + encoder, + dim=encoder_dim[i], + t_downsample=downsampling_factor[i], + f_downsample=f_downsampling_factor[i], + dropout=dropout, + ) + + encoders.append(encoder) + + self.encoders = nn.ModuleList(encoders) + + self.downsample_output = SimpleDownsample( + max(encoder_dim), + downsample=output_downsampling_factor, + dropout=dropout) + + def forward(self, x): + """ + Forward pass of the Zipformer2DualPathEncoder module. + + Args: + x (Tensor): Input tensor of shape [B, C, T, F]. + + Returns: + Tensor: Output tensor after passing through the encoder. 
+ """ + outputs = [] + + # if torch.jit.is_scripting() or torch.jit.is_tracing(): + # feature_masks = [1.0] * len(self.encoder_dim) + # else: + # feature_masks = self.get_feature_masks(x) + feature_masks = [1.0] * len(self.encoder_dim) + attn_mask = None + + chunk_size = -1 + # left_context_chunks = -1 + + for i, module in enumerate(self.encoders): + + x = convert_num_channels(x, self.encoder_dim[i]) + + x = module( + x, + chunk_size=chunk_size, + feature_mask=feature_masks[i], + src_key_padding_mask=None, + attn_mask=attn_mask, + ) + outputs.append(x) + + # (b, c, t, f) + return x + + +if __name__ == '__main__': + + # {2,2,2,2,2,2} {192,256,256,256,256,256} {512,768,768,768,768,768} + downsampling_factor = (1, 2, 4, 3) # + encoder_dim = (16, 32, 64, 64) + pos_dim = 48 # zipformer base设置 + num_heads = (4, 4, 4, 4) # "4,4,4,8,4,4" + query_head_dim = (16, ) * len(downsampling_factor) # 32 + pos_head_dim = (4, ) * len(downsampling_factor) # 4 + value_head_dim = (12, ) * len(downsampling_factor) # 12 + feedforward_dim = (32, 64, 128, 128) # + dropout = ScheduledFloat((0.0, 0.3), (20000.0, 0.1)) + cnn_module_kernel = (15, ) * len(downsampling_factor) # 31,31,15,15,15,31 + causal = False + encoder_unmasked_dim = (16, ) * len(downsampling_factor) + + num_encoder_layers = (1, 1, 1, 1) + warmup_batches = 4000.0 + + net = Zipformer2DualPathEncoder( + output_downsampling_factor=1, + downsampling_factor=downsampling_factor, + num_encoder_layers=num_encoder_layers, + encoder_dim=encoder_dim, + encoder_unmasked_dim=encoder_unmasked_dim, + query_head_dim=query_head_dim, + pos_head_dim=pos_head_dim, + value_head_dim=value_head_dim, + pos_dim=pos_dim, + num_heads=num_heads, + feedforward_dim=feedforward_dim, + cnn_module_kernel=cnn_module_kernel, + dropout=ScheduledFloat((0.0, 0.3), (20000.0, 0.1)), + warmup_batches=warmup_batches, + causal=causal, + ) + + # net = DownsampledZipformer2Encoder( + # None, 128, 2, 0. + # ) + # x = torch.randn((101, 2, 128)) + b = 4 + t = 321 + f = 101 + c = 64 + + # x = torch.randn((101, 2, 128)) + x = torch.randn((b, c, t, f)) + + x = net(x) + print(x.size()) diff --git a/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py b/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py new file mode 100644 index 000000000..42ad6df1d --- /dev/null +++ b/modelscope/models/audio/ans/zipenhancer_layers/zipformer.py @@ -0,0 +1,1084 @@ +#!/usr/bin/env python3 +# Copyright 2022-2023 Xiaomi Corp. (authors: Daniel Povey, +# Zengwei Yao) +# Copyright (c) 2024 Alibaba, Inc. and its affiliates. +# +# See ../../../../LICENSE for clarification regarding multiple authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import logging +import math +import random +import warnings +from typing import List, Optional, Tuple, Union + +import torch +from torch import Tensor, nn + +from .scaling import \ + Identity # more friendly to backward hooks than nn.Identity(), for diagnostic reasons. +from .scaling import \ + ScaledLinear # not as in other dirs.. 
just scales down initial parameter values. +from .scaling import (ActivationDropoutAndLinear, BiasNorm, + ChunkCausalDepthwiseConv1d, Dropout2, FloatLike, + ScheduledFloat, limit_param_value, + penalize_abs_values_gt, softmax) + + +class Zipformer2EncoderLayer(nn.Module): + """ + Args: + embed_dim: the number of expected features in the input (required). + nhead: the number of heads in the multiheadattention models (required). + feedforward_dim: the dimension of the feedforward network model (required). + dropout: the dropout value (default=0.1). + cnn_module_kernel (int): Kernel size of convolution module (default=31). + + Examples:: + >>> encoder_layer = Zipformer2EncoderLayer(embed_dim=512, nhead=8) + >>> src = torch.rand(10, 32, 512) + >>> pos_emb = torch.rand(32, 19, 512) + >>> out = encoder_layer(src, pos_emb) + """ + + def __init__( + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + value_head_dim: int, + feedforward_dim: int, + dropout: FloatLike = 0.1, + cnn_module_kernel: int = 31, + causal: bool = False, + attention_skip_rate: FloatLike = ScheduledFloat( + (0.0, 0.2), (4000.0, 0.05), (16000, 0.0), default=0), + conv_skip_rate: FloatLike = ScheduledFloat((0.0, 0.2), (4000.0, 0.05), + (16000, 0.0), + default=0), + const_attention_rate: FloatLike = ScheduledFloat((0.0, 0.25), + (4000.0, 0.025), + default=0), + ff2_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), + (50000.0, 0.0)), + ff3_skip_rate: FloatLike = ScheduledFloat((0.0, 0.1), (4000.0, 0.01), + (50000.0, 0.0)), + bypass_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), + (4000.0, 0.02), + default=0), + ) -> None: + super(Zipformer2EncoderLayer, self).__init__() + self.embed_dim = embed_dim + + # self.bypass implements layer skipping as well as bypass; see its default values. + self.bypass = BypassModule( + embed_dim, skip_rate=bypass_skip_rate, straight_through_rate=0) + # bypass_mid is bypass used in the middle of the layer. + self.bypass_mid = BypassModule(embed_dim, straight_through_rate=0) + + # skip probability for dynamic modules (meaning: anything but feedforward). + self.attention_skip_rate = copy.deepcopy(attention_skip_rate) + # an additional skip probability that applies to ConvModule to stop it from + # contributing too much early on. + self.conv_skip_rate = copy.deepcopy(conv_skip_rate) + + # ff2_skip_rate is to prevent the ff2 module from having output that's too big + # compared to its residual. 
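+        # (The ScheduledFloat defaults above are schedules over the training batch
+        #  index: e.g. ScheduledFloat((0.0, 0.1), (4000.0, 0.01), (50000.0, 0.0)) gives
+        #  a skip probability that decays from 0.1 at batch 0 to 0.01 by batch 4000
+        #  and to 0.0 by batch 50000.)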
+ self.ff2_skip_rate = copy.deepcopy(ff2_skip_rate) + self.ff3_skip_rate = copy.deepcopy(ff3_skip_rate) + + self.const_attention_rate = copy.deepcopy(const_attention_rate) + + self.self_attn_weights = RelPositionMultiheadAttentionWeights( + embed_dim, + pos_dim=pos_dim, + num_heads=num_heads, + query_head_dim=query_head_dim, + pos_head_dim=pos_head_dim, + dropout=0.0, + ) + + self.self_attn1 = SelfAttention(embed_dim, num_heads, value_head_dim) + + self.self_attn2 = SelfAttention(embed_dim, num_heads, value_head_dim) + + self.feed_forward1 = FeedforwardModule(embed_dim, + (feedforward_dim * 3) // 4, + dropout) + + self.feed_forward2 = FeedforwardModule(embed_dim, feedforward_dim, + dropout) + + self.feed_forward3 = FeedforwardModule(embed_dim, + (feedforward_dim * 5) // 4, + dropout) + + self.nonlin_attention = NonlinAttention( + embed_dim, hidden_channels=3 * embed_dim // 4) + + self.conv_module1 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal) + + self.conv_module2 = ConvolutionModule( + embed_dim, cnn_module_kernel, causal=causal) + + # TODO: remove it + self.bypass_scale = nn.Parameter(torch.full((embed_dim, ), 0.5)) + + self.norm = BiasNorm(embed_dim) + + self.balancer1 = Identity() + self.balancer_na = Identity() + self.balancer_ff2 = Identity() + self.balancer_ff3 = Identity() + self.whiten = Identity() + self.balancer2 = Identity() + + def get_sequence_dropout_mask(self, x: Tensor, + dropout_rate: float) -> Optional[Tensor]: + if (dropout_rate == 0.0 or not self.training + or torch.jit.is_scripting() or torch.jit.is_tracing()): + return None + batch_size = x.shape[1] + mask = (torch.rand(batch_size, 1, device=x.device) > dropout_rate).to( + x.dtype) + return mask + + def sequence_dropout(self, x: Tensor, dropout_rate: float) -> Tensor: + """ + Apply sequence-level dropout to x. + x shape: (seq_len, batch_size, embed_dim) + """ + dropout_mask = self.get_sequence_dropout_mask(x, dropout_rate) + if dropout_mask is None: + return x + else: + return x * dropout_mask + + def forward( + self, + src: Tensor, + pos_emb: Tensor, + chunk_size: int = -1, + attn_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + ) -> Tensor: + """ + Pass the input through the encoder layer. + Args: + src: the sequence to the encoder (required): shape (seq_len, batch_size, embedding_dim). + pos_emb: (1, 2*seq_len-1, pos_emb_dim) or (batch_size, 2*seq_len-1, pos_emb_dim) + chunk_size: the number of frames per chunk, of >= 0; if -1, no chunking. + feature_mask: something that broadcasts with src, that we'll multiply `src` + by at every layer: if a Tensor, likely of shape (seq_len, batch_size, embedding_dim) + attn_mask: the attention mask, of shape (batch_size, seq_len, seq_len) or (seq_len, seq_len), + interpreted as (batch_size, tgt_seq_len, src_seq_len) or (tgt_seq_len, src_seq_len). + True means masked position. May be None. + src_key_padding_mask: the mask for padding, of shape (batch_size, seq_len); True means + masked position. May be None. 
+ + Returns: + A tensor which has the same shape as src + """ + src_orig = src + + # dropout rate for non-feedforward submodules + if torch.jit.is_scripting() or torch.jit.is_tracing(): + attention_skip_rate = 0.0 + else: + attention_skip_rate = ( + float(self.attention_skip_rate) if self.training else 0.0) + + # attn_weights: (num_heads, batch_size, seq_len, seq_len) + attn_weights = self.self_attn_weights( + src, + pos_emb=pos_emb, + attn_mask=attn_mask, + key_padding_mask=src_key_padding_mask, + ) + + src = src + self.feed_forward1(src) + + self_attn_dropout_mask = self.get_sequence_dropout_mask( + src, attention_skip_rate) + + selected_attn_weights = attn_weights[0:1] + if torch.jit.is_scripting() or torch.jit.is_tracing(): + pass + elif self.training and random.random() < float( + self.const_attention_rate): + # Make attention weights constant. The intention is to + # encourage these modules to do something similar to an + # averaging-over-time operation. + # only need the mask, can just use the 1st one and expand later + selected_attn_weights = selected_attn_weights[0:1] + selected_attn_weights = (selected_attn_weights > 0.0).to( + selected_attn_weights.dtype) + selected_attn_weights = selected_attn_weights * ( + 1.0 / selected_attn_weights.sum(dim=-1, keepdim=True)) + + na = self.balancer_na( + self.nonlin_attention(src, selected_attn_weights)) + + src = src + ( + na if self_attn_dropout_mask is None else na + * self_attn_dropout_mask) + + self_attn = self.self_attn1(src, attn_weights) + + src = src + ( + self_attn if self_attn_dropout_mask is None else self_attn + * self_attn_dropout_mask) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + conv_skip_rate = 0.0 + else: + conv_skip_rate = float( + self.conv_skip_rate) if self.training else 0.0 + + src = src + self.sequence_dropout( + self.conv_module1( + src, + chunk_size=chunk_size, + src_key_padding_mask=src_key_padding_mask), + conv_skip_rate, + ) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + ff2_skip_rate = 0.0 + else: + ff2_skip_rate = float(self.ff2_skip_rate) if self.training else 0.0 + src = src + self.sequence_dropout( + self.balancer_ff2(self.feed_forward2(src)), ff2_skip_rate) + + # bypass in the middle of the layer. + src = self.bypass_mid(src_orig, src) + + self_attn = self.self_attn2(src, attn_weights) + + src = src + ( + self_attn if self_attn_dropout_mask is None else self_attn + * self_attn_dropout_mask) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + conv_skip_rate = 0.0 + else: + conv_skip_rate = float( + self.conv_skip_rate) if self.training else 0.0 + + src = src + self.sequence_dropout( + self.conv_module2( + src, + chunk_size=chunk_size, + src_key_padding_mask=src_key_padding_mask), + conv_skip_rate, + ) + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + ff3_skip_rate = 0.0 + else: + ff3_skip_rate = float(self.ff3_skip_rate) if self.training else 0.0 + src = src + self.sequence_dropout( + self.balancer_ff3(self.feed_forward3(src)), ff3_skip_rate) + + src = self.balancer1(src) + src = self.norm(src) + + src = self.bypass(src_orig, src) + + src = self.balancer2(src) + src = self.whiten(src) + + return src + + +class BypassModule(nn.Module): + """ + An nn.Module that implements a learnable bypass scale, and also randomized per-sequence + layer-skipping. The bypass is limited during early stages of training to be close to + "straight-through", i.e. to not do the bypass operation much initially, in order to + force all the modules to learn something. 
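+
+    Example (an illustrative sketch; shapes follow the forward() contract below)::
+
+        >>> bypass = BypassModule(embed_dim=256)
+        >>> src_orig = torch.rand(10, 32, 256)
+        >>> src = torch.rand(10, 32, 256)
+        >>> out = bypass(src_orig, src)  # same shape as src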
+ """ + + def __init__( + self, + embed_dim: int, + skip_rate: FloatLike = 0.0, + straight_through_rate: FloatLike = 0.0, + scale_min: FloatLike = ScheduledFloat((0.0, 0.9), (20000.0, 0.2), + default=0), + scale_max: FloatLike = 1.0, + ): + super().__init__() + self.bypass_scale = nn.Parameter(torch.full((embed_dim, ), 0.5)) + self.skip_rate = copy.deepcopy(skip_rate) + self.straight_through_rate = copy.deepcopy(straight_through_rate) + self.scale_min = copy.deepcopy(scale_min) + self.scale_max = copy.deepcopy(scale_max) + + def _get_bypass_scale(self, batch_size: int): + # returns bypass-scale of shape (num_channels,), + # or (batch_size, num_channels,). This is actually the + # scale on the non-residual term, so 0 corresponds to bypassing + # this module. + if torch.jit.is_scripting() or torch.jit.is_tracing( + ) or not self.training: + return self.bypass_scale + else: + ans = limit_param_value( + self.bypass_scale, + min=float(self.scale_min), + max=float(self.scale_max)) + skip_rate = float(self.skip_rate) + if skip_rate != 0.0: + mask = torch.rand( + (batch_size, 1), device=ans.device) > skip_rate + ans = ans * mask + # now ans is of shape (batch_size, num_channels), and is zero for sequences + # on which we have randomly chosen to do layer-skipping. + straight_through_rate = float(self.straight_through_rate) + if straight_through_rate != 0.0: + _rand_tensor = torch.rand((batch_size, 1), device=ans.device) + mask = (_rand_tensor < straight_through_rate) + ans = torch.maximum(ans, mask.to(ans.dtype)) + return ans + + def forward(self, src_orig: Tensor, src: Tensor): + """ + Args: src_orig and src are both of shape (seq_len, batch_size, num_channels) + Returns: something with the same shape as src and src_orig + """ + # bypass_scale = self._get_bypass_scale(src.shape[1]) + bypass_scale = self._get_bypass_scale(src.shape[-2]) + return src_orig + (src - src_orig) * bypass_scale + + +class SimpleDownsample(torch.nn.Module): + """ + Does downsampling with attention, by weighted sum, and a projection.. + """ + + def __init__(self, channels: int, downsample: int, dropout: FloatLike): + super(SimpleDownsample, self).__init__() + + self.bias = nn.Parameter(torch.zeros(downsample)) + + self.name = None # will be set from training code + self.dropout = copy.deepcopy(dropout) + + self.downsample = downsample + + def forward(self, src: Tensor) -> Tensor: + """ + x: (seq_len, batch_size, in_channels) + Returns a tensor of shape + ( (seq_len+downsample-1)//downsample, batch_size, channels) + """ + (seq_len, batch_size, in_channels) = src.shape + ds = self.downsample + d_seq_len = (seq_len + ds - 1) // ds + + # Pad to an exact multiple of self.downsample + # right-pad src, repeating the last element. + pad = d_seq_len * ds - seq_len + src_extra = src[src.shape[0] - 1:].expand(pad, src.shape[1], + src.shape[2]) + src = torch.cat((src, src_extra), dim=0) + assert src.shape[0] == d_seq_len * ds + + src = src.reshape(d_seq_len, ds, batch_size, in_channels) + + weights = self.bias.softmax(dim=0) + # weights: (downsample, 1, 1) + weights = weights.unsqueeze(-1).unsqueeze(-1) + + # ans1 is the first `in_channels` channels of the output + ans = (src * weights).sum(dim=1) + + return ans + + +class SimpleUpsample(torch.nn.Module): + """ + A very simple form of upsampling that mostly just repeats the input, but + also adds a position-specific bias. 
+ """ + + def __init__(self, num_channels: int, upsample: int): + super(SimpleUpsample, self).__init__() + self.upsample = upsample + + def forward(self, src: Tensor) -> Tensor: + """ + x: (seq_len, batch_size, num_channels) + Returns a tensor of shape + ( (seq_len*upsample), batch_size, num_channels) + """ + upsample = self.upsample + (seq_len, batch_size, num_channels) = src.shape + src = src.unsqueeze(1).expand(seq_len, upsample, batch_size, + num_channels) + src = src.reshape(seq_len * upsample, batch_size, num_channels) + return src + + +class CompactRelPositionalEncoding(torch.nn.Module): + """ + Relative positional encoding module. This version is "compact" meaning it is able to encode + the important information about the relative position in a relatively small number of dimensions. + The goal is to make it so that small differences between large relative offsets (e.g. 1000 vs. 1001) + make very little difference to the embedding. Such differences were potentially important + when encoding absolute position, but not important when encoding relative position because there + is now no need to compare two large offsets with each other. + + Our embedding works by projecting the interval [-infinity,infinity] to a finite interval + using the atan() function, before doing the Fourier transform of that fixed interval. The + atan() function would compress the "long tails" too small, + making it hard to distinguish between different magnitudes of large offsets, so we use a logarithmic + function to compress large offsets to a smaller range before applying atan(). + Scalings are chosen in such a way that the embedding can clearly distinguish individual offsets as long + as they are quite close to the origin, e.g. abs(offset) <= about sqrt(embedding_dim) + + + Args: + embed_dim: Embedding dimension. + dropout_rate: Dropout rate. + max_len: Maximum input length: just a heuristic for initialization. + length_factor: a heuristic scale (should be >= 1.0) which, if larger, gives + less weight to small differences of offset near the origin. + """ + + def __init__( + self, + embed_dim: int, + dropout_rate: FloatLike, + max_len: int = 1000, + length_factor: float = 1.0, + ) -> None: + """Construct a CompactRelPositionalEncoding object.""" + super(CompactRelPositionalEncoding, self).__init__() + self.embed_dim = embed_dim + assert embed_dim % 2 == 0, embed_dim + self.dropout = Dropout2(dropout_rate) + self.pe = None + assert length_factor >= 1.0, length_factor + self.length_factor = length_factor + self.extend_pe(torch.tensor(0.0).expand(max_len)) + + def extend_pe(self, x: Tensor, left_context_len: int = 0) -> None: + """Reset the positional encodings.""" + T = x.size(0) + left_context_len + + if self.pe is not None: + # self.pe contains both positive and negative parts + # the length of self.pe is 2 * input_len - 1 + if self.pe.size(0) >= T * 2 - 1: + self.pe = self.pe.to(dtype=x.dtype, device=x.device) + return + + # if T == 4, x would contain [ -3, -2, 1, 0, 1, 2, 3 ] + x = torch.arange( + -(T - 1), T, device=x.device).to(torch.float32).unsqueeze(1) + + freqs = 1 + torch.arange(self.embed_dim // 2, device=x.device) + + # `compression_length` this is arbitrary/heuristic, if it is larger we have more resolution + # for small time offsets but less resolution for large time offsets. + compression_length = self.embed_dim**0.5 + # x_compressed, like X, goes from -infinity to infinity as T goes from -infinity to infinity; + # but it does so more slowly than T for large absolute values of T. 
+ # The formula is chosen so that d(x_compressed )/dx is 1 around x == 0, which + # is important. + _tmp_tensor = ((x.abs() + compression_length).log() + - math.log(compression_length)) + x_compressed = (compression_length * x.sign() * _tmp_tensor) + + # if self.length_factor == 1.0, then length_scale is chosen so that the + # FFT can exactly separate points close to the origin (T == 0). So this + # part of the formulation is not really heuristic. + # But empirically, for ASR at least, length_factor > 1.0 seems to work better. + length_scale = self.length_factor * self.embed_dim / (2.0 * math.pi) + + # note for machine implementations: if atan is not available, we can use: + # x.sign() * ((1 / (x.abs() + 1)) - 1) * (-math.pi/2) + # check on wolframalpha.com: plot(sign(x) * (1 / ( abs(x) + 1) - 1 ) * -pi/2 , atan(x)) + x_atan = (x_compressed + / length_scale).atan() # results between -pi and pi + + cosines = (x_atan * freqs).cos() + sines = (x_atan * freqs).sin() + + pe = torch.zeros(x.shape[0], self.embed_dim, device=x.device) + pe[:, 0::2] = cosines + pe[:, 1::2] = sines + pe[:, -1] = 1.0 # for bias. + + self.pe = pe.to(dtype=x.dtype) + + def forward(self, x: Tensor, left_context_len: int = 0) -> Tensor: + """Create positional encoding. + + Args: + x (Tensor): Input tensor (time, batch, `*`). + left_context_len: (int): Length of cached left context. + + Returns: + positional embedding, of shape (batch, left_context_len + 2*time-1, `*`). + """ + self.extend_pe(x, left_context_len) + x_size_left = x.size(0) + left_context_len + # length of positive side: x.size(0) + left_context_len + # length of negative side: x.size(0) + pos_emb = self.pe[self.pe.size(0) // 2 - x_size_left + + 1:self.pe.size(0) // 2 # noqa E203 + + x.size(0), :, ] + pos_emb = pos_emb.unsqueeze(0) + return self.dropout(pos_emb) + + +class RelPositionMultiheadAttentionWeights(nn.Module): + r"""Module that computes multi-head attention weights with relative position encoding. + Various other modules consume the resulting attention weights: see, for example, the + SimpleAttention module which allows you to compute conventional attention. + + This is a quite heavily modified from: "Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context", + we have to write up the differences. + + + Args: + embed_dim: number of channels at the input to this module, e.g. 256 + pos_dim: dimension of the positional encoding vectors, e.g. 128. + num_heads: number of heads to compute weights for, e.g. 8 + query_head_dim: dimension of the query (and key), per head. e.g. 24. + pos_head_dim: dimension of the projected positional encoding per head, e.g. 4. + dropout: dropout probability for attn_output_weights. Default: 0.0. + pos_emb_skip_rate: probability for skipping the pos_emb part of the scores on + any given call to forward(), in training time. + """ + + def __init__( + self, + embed_dim: int, + pos_dim: int, + num_heads: int, + query_head_dim: int, + pos_head_dim: int, + dropout: float = 0.0, + pos_emb_skip_rate: FloatLike = ScheduledFloat((0.0, 0.5), + (4000.0, 0.0)), + ) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.query_head_dim = query_head_dim + self.pos_head_dim = pos_head_dim + self.dropout = dropout + self.pos_emb_skip_rate = copy.deepcopy(pos_emb_skip_rate) + self.name = None # will be overwritten in training code; for diagnostics. 
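+
+        # The single in_proj below packs the query, key and positional-query
+        # projections into one linear layer; forward() slices its output back
+        # into q, k and p.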
+ + key_head_dim = query_head_dim + in_proj_dim = (query_head_dim + key_head_dim + + pos_head_dim) * num_heads + + # the initial_scale is supposed to take over the "scaling" factor of + # head_dim ** -0.5 that has been used in previous forms of attention, + # dividing it between the query and key. Note: this module is intended + # to be used with the ScaledAdam optimizer; with most other optimizers, + # it would be necessary to apply the scaling factor in the forward function. + self.in_proj = ScaledLinear( + embed_dim, + in_proj_dim, + bias=True, + initial_scale=query_head_dim**-0.25) + + self.whiten_keys = Identity() + self.balance_keys = Identity() + + # linear transformation for positional encoding. + self.linear_pos = ScaledLinear( + pos_dim, num_heads * pos_head_dim, bias=False, initial_scale=0.05) + + # the following are for diagnostics only, see --print-diagnostics option + self.copy_pos_query = Identity() + self.copy_query = Identity() + + def forward( + self, + x: Tensor, + pos_emb: Tensor, + key_padding_mask: Optional[Tensor] = None, + attn_mask: Optional[Tensor] = None, + ) -> Tensor: + r""" + Args: + x: input of shape (seq_len, batch_size, embed_dim) + pos_emb: Positional embedding tensor, of shape (1, 2*seq_len - 1, pos_dim) + key_padding_mask: a bool tensor of shape (batch_size, seq_len). Positions that + are True in this mask will be ignored as sources in the attention weighting. + attn_mask: mask of shape (seq_len, seq_len) or (batch_size, seq_len, seq_len), + interpreted as ([batch_size,] tgt_seq_len, src_seq_len) + saying which positions are allowed to attend to which other positions. + Returns: + a tensor of attention weights, of shape (hum_heads, batch_size, seq_len, seq_len) + interpreted as (hum_heads, batch_size, tgt_seq_len, src_seq_len). + """ + x = self.in_proj(x) + query_head_dim = self.query_head_dim + pos_head_dim = self.pos_head_dim + num_heads = self.num_heads + + seq_len, batch_size, _ = x.shape + + query_dim = query_head_dim * num_heads + + # self-attention + q = x[..., 0:query_dim] + k = x[..., query_dim:2 * query_dim] + # p is the position-encoding query + p = x[..., 2 * query_dim:] + assert p.shape[-1] == num_heads * pos_head_dim, (p.shape[-1], + num_heads, + pos_head_dim) + + q = self.copy_query(q) # for diagnostics only, does nothing. + k = self.whiten_keys( + self.balance_keys(k)) # does nothing in the forward pass. + p = self.copy_pos_query(p) # for diagnostics only, does nothing. + + q = q.reshape(seq_len, batch_size, num_heads, query_head_dim) + p = p.reshape(seq_len, batch_size, num_heads, pos_head_dim) + k = k.reshape(seq_len, batch_size, num_heads, query_head_dim) + + # time1 refers to target, time2 refers to source. 
+ q = q.permute(2, 1, 0, 3) # (head, batch, time1, query_head_dim) + p = p.permute(2, 1, 0, 3) # (head, batch, time1, pos_head_dim) + k = k.permute(2, 1, 3, 0) # (head, batch, d_k, time2) + + # print(f"MHSAW {q.shape} {k.shape}") + attn_scores = torch.matmul(q, k) + + use_pos_scores = False + if torch.jit.is_scripting() or torch.jit.is_tracing(): + # We can't put random.random() in the same line + use_pos_scores = True + elif not self.training or random.random() >= float( + self.pos_emb_skip_rate): + use_pos_scores = True + + if use_pos_scores: + pos_emb = self.linear_pos(pos_emb) + seq_len2 = 2 * seq_len - 1 + pos_emb = pos_emb.reshape(-1, seq_len2, num_heads, + pos_head_dim).permute(2, 0, 3, 1) + # pos shape now: (head, {1 or batch_size}, pos_dim, seq_len2) + + # (head, batch, time1, pos_dim) x (head, 1, pos_dim, seq_len2) -> (head, batch, time1, seq_len2) + # [where seq_len2 represents relative position.] + # print(f"MHSAW pos {p.shape} {pos_emb.shape}") + pos_scores = torch.matmul(p, pos_emb) + # the following .as_strided() expression converts the last axis of pos_scores from relative + # to absolute position. I don't know whether I might have got the time-offsets backwards or + # not, but let this code define which way round it is supposed to be. + if torch.jit.is_tracing(): + (num_heads, batch_size, time1, n) = pos_scores.shape + rows = torch.arange(start=time1 - 1, end=-1, step=-1) + cols = torch.arange(seq_len) + rows = rows.repeat(batch_size * num_heads).unsqueeze(-1) + indexes = rows + cols + pos_scores = pos_scores.reshape(-1, n) + pos_scores = torch.gather(pos_scores, dim=1, index=indexes) + pos_scores = pos_scores.reshape(num_heads, batch_size, time1, + seq_len) + else: + pos_scores = pos_scores.as_strided( + (num_heads, batch_size, seq_len, seq_len), + ( + pos_scores.stride(0), + pos_scores.stride(1), + pos_scores.stride(2) - pos_scores.stride(3), + pos_scores.stride(3), + ), + storage_offset=pos_scores.stride(3) * (seq_len - 1), + ) + # print(attn_scores.shape, pos_scores.shape) + if self.training: + attn_scores = attn_scores + pos_scores + else: + # inplace operator important + attn_scores.add_(pos_scores) + # attn_scores = attn_scores + pos_scores + + if torch.jit.is_scripting() or torch.jit.is_tracing(): + pass + elif self.training and random.random() < 0.1: + # This is a harder way of limiting the attention scores to not be + # too large. It incurs a penalty if any of them has an absolute + # value greater than 50.0. this should be outside the normal range + # of the attention scores. We use this mechanism instead of, say, + # something added to the loss function involving the entropy, + # because once the entropy gets very small gradients through the + # softmax can become very small, and we'd get zero derivatives. The + # choices of 1.0e-04 as the scale on the penalty makes this + # mechanism vulnerable to the absolute scale of the loss function, + # but we view this as a failsafe to avoid "implausible" parameter + # values rather than a regularization method that should be active + # under normal circumstances. + attn_scores = penalize_abs_values_gt( + attn_scores, limit=25.0, penalty=1.0e-04, name=self.name) + + assert attn_scores.shape == (num_heads, batch_size, seq_len, seq_len) + + if attn_mask is not None: + assert attn_mask.dtype == torch.bool + # use -1000 to avoid nan's where attn_mask and key_padding_mask make + # all scores zero. 
It's important that this be large enough that exp(-1000) + # is exactly zero, for reasons related to const_attention_rate, it + # compares the final weights with zero. + attn_scores = attn_scores.masked_fill(attn_mask, -1000) + + if key_padding_mask is not None: + assert key_padding_mask.shape == ( + batch_size, + seq_len, + ), key_padding_mask.shape + attn_scores = attn_scores.masked_fill( + key_padding_mask.unsqueeze(1), + -1000, + ) + + # We use our own version of softmax, defined in scaling.py, which should + # save a little of the memory used in backprop by, if we are in + # automatic mixed precision mode (amp / autocast), by only storing the + # half-precision output for backprop purposes. + attn_weights = softmax(attn_scores, dim=-1) + + attn_weights = nn.functional.dropout( + attn_weights, p=self.dropout, training=self.training) + + return attn_weights + + +class SelfAttention(nn.Module): + """ + The simplest possible attention module. This one works with already-computed attention + weights, e.g. as computed by RelPositionMultiheadAttentionWeights. + + Args: + embed_dim: the input and output embedding dimension + num_heads: the number of attention heads + value_head_dim: the value dimension per head + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + value_head_dim: int, + ) -> None: + super().__init__() + self.in_proj = nn.Linear( + embed_dim, num_heads * value_head_dim, bias=True) + + self.out_proj = ScaledLinear( + num_heads * value_head_dim, + embed_dim, + bias=True, + initial_scale=0.05) + + self.whiten = Identity() + + def forward( + self, + x: Tensor, + attn_weights: Tensor, + ) -> Tensor: + """ + Args: + x: input tensor, of shape (seq_len, batch_size, embed_dim) + attn_weights: a tensor of shape (num_heads, batch_size, seq_len, seq_len), + with seq_len being interpreted as (tgt_seq_len, src_seq_len). Expect + attn_weights.sum(dim=-1) == 1. + Returns: + a tensor with the same shape as x. + """ + (seq_len, batch_size, embed_dim) = x.shape + num_heads = attn_weights.shape[0] + assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len) + + x = self.in_proj( + x) # (seq_len, batch_size, num_heads * value_head_dim) + x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3) + # now x: (num_heads, batch_size, seq_len, value_head_dim) + value_head_dim = x.shape[-1] + + # todo: see whether there is benefit in overriding matmul + # print(f"SelfAttetion pos {attn_weights.shape} {x.shape}") + x = torch.matmul(attn_weights, x) + # v: (num_heads, batch_size, seq_len, value_head_dim) + + x = ( + x.permute(2, 1, 0, + 3).contiguous().view(seq_len, batch_size, + num_heads * value_head_dim)) + + # returned value is of shape (seq_len, batch_size, embed_dim), like the input. 
+ x = self.out_proj(x) + x = self.whiten(x) + + return x + + +class FeedforwardModule(nn.Module): + """Feedforward module in Zipformer2 model.""" + + def __init__(self, embed_dim: int, feedforward_dim: int, + dropout: FloatLike): + super(FeedforwardModule, self).__init__() + self.in_proj = nn.Linear(embed_dim, feedforward_dim) + + self.hidden_balancer = Identity() + + # shared_dim=0 means we share the dropout mask along the time axis + self.out_proj = ActivationDropoutAndLinear( + feedforward_dim, + embed_dim, + activation='SwooshL', + dropout_p=dropout, + dropout_shared_dim=0, + bias=True, + initial_scale=0.1, + ) + + self.out_whiten = Identity() + + def forward(self, x: Tensor): + x = self.in_proj(x) + x = self.hidden_balancer(x) + # out_proj contains SwooshL activation, then dropout, then linear. + x = self.out_proj(x) + x = self.out_whiten(x) + return x + + +class NonlinAttention(nn.Module): + """This is like the ConvolutionModule, but refactored so that we use multiplication by attention weights (borrowed + from the attention module) in place of actual convolution. We also took out the second nonlinearity, the + one after the attention mechanism. + + Args: + channels (int): The number of channels of conv layers. + """ + + def __init__( + self, + channels: int, + hidden_channels: int, + ) -> None: + super().__init__() + + self.hidden_channels = hidden_channels + + self.in_proj = nn.Linear(channels, hidden_channels * 3, bias=True) + + self.balancer = Identity() + self.tanh = nn.Tanh() + + self.identity1 = Identity() # for diagnostics. + self.identity2 = Identity() # for diagnostics. + self.identity3 = Identity() # for diagnostics. + + self.out_proj = ScaledLinear( + hidden_channels, channels, bias=True, initial_scale=0.05) + + self.whiten1 = Identity() + self.whiten2 = Identity() + + def forward( + self, + x: Tensor, + attn_weights: Tensor, + ) -> Tensor: + """. + Args: + x: a Tensor of shape (seq_len, batch_size, num_channels) + attn_weights: a Tensor of shape (num_heads, batch_size, seq_len, seq_len) + Returns: + a Tensor with the same shape as x + """ + x = self.in_proj(x) + + (seq_len, batch_size, _) = x.shape + hidden_channels = self.hidden_channels + + s, x, y = x.chunk(3, dim=2) + + # s will go through tanh. + + s = self.balancer(s) + s = self.tanh(s) + + s = s.unsqueeze(-1).reshape(seq_len, batch_size, hidden_channels) + x = self.whiten1(x) + x = x * s + x = self.identity1(x) # diagnostics only, it's the identity. + + (seq_len, batch_size, embed_dim) = x.shape + num_heads = attn_weights.shape[0] + assert attn_weights.shape == (num_heads, batch_size, seq_len, seq_len) + + x = x.reshape(seq_len, batch_size, num_heads, -1).permute(2, 1, 0, 3) + # now x: (num_heads, batch_size, seq_len, head_dim) + # print(f"nonlinattion {attn_weights.shape} {x.shape}") + x = torch.matmul(attn_weights, x) + # now x: (num_heads, batch_size, seq_len, head_dim) + x = x.permute(2, 1, 0, 3).reshape(seq_len, batch_size, -1) + + y = self.identity2(y) + x = x * y + x = self.identity3(x) + + x = self.out_proj(x) + x = self.whiten2(x) + return x + + +class ConvolutionModule(nn.Module): + """ConvolutionModule in Zipformer2 model. + Modified from https://github.com/espnet/espnet/blob/master/espnet/nets/pytorch_backend/zipformer/convolution.py + + Args: + channels (int): The number of channels of conv layers. + kernel_size (int): Kernerl size of conv layers. + bias (bool): Whether to use bias in conv layers (default=True). 
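+
+    Example (an illustrative sketch; layout follows forward() below)::
+
+        >>> conv = ConvolutionModule(channels=256, kernel_size=31, causal=False)
+        >>> y = conv(torch.rand(100, 8, 256))  # (#time, batch, channels) -> same shape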
+ + """ + + def __init__( + self, + channels: int, + kernel_size: int, + causal: bool, + ) -> None: + """Construct a ConvolutionModule object.""" + super(ConvolutionModule, self).__init__() + # kernerl_size should be a odd number for 'SAME' padding + assert (kernel_size - 1) % 2 == 0 + + bottleneck_dim = channels + self.causal = causal + + self.in_proj = nn.Linear( + channels, + 2 * bottleneck_dim, + ) + # the gradients on in_proj are a little noisy, likely to do with the + # sigmoid in glu. + + self.balancer1 = Identity() + + self.activation1 = Identity() # for diagnostics + + self.sigmoid = nn.Sigmoid() + + self.activation2 = Identity() # for diagnostics + + assert kernel_size % 2 == 1 + + self.depthwise_conv = ( + ChunkCausalDepthwiseConv1d( + channels=bottleneck_dim, kernel_size=kernel_size) + if causal else nn.Conv1d( + in_channels=bottleneck_dim, + out_channels=bottleneck_dim, + groups=bottleneck_dim, + kernel_size=kernel_size, + padding=kernel_size // 2, + )) + + self.balancer2 = Identity() + + self.whiten = Identity() + + self.out_proj = ActivationDropoutAndLinear( + bottleneck_dim, + channels, + activation='SwooshR', + dropout_p=0.0, + initial_scale=0.05, + ) + + def forward( + self, + x: Tensor, + src_key_padding_mask: Optional[Tensor] = None, + chunk_size: int = -1, + ) -> Tensor: + """Compute convolution module. + + Args: + x: Input tensor (#time, batch, channels). + src_key_padding_mask: the mask for the src keys per batch (optional): + (batch, #time), contains True in masked positions. + + Returns: + Tensor: Output tensor (#time, batch, channels). + + """ + + x = self.in_proj(x) # (time, batch, 2*channels) + + x, s = x.chunk(2, dim=2) + s = self.balancer1(s) + s = self.sigmoid(s) + x = self.activation1(x) # identity. + x = x * s + x = self.activation2(x) # identity + + # (time, batch, channels) + + # exchange the temporal dimension and the feature dimension + x = x.permute(1, 2, 0) # (#batch, channels, time). + + if src_key_padding_mask is not None: + x = x.masked_fill( + src_key_padding_mask.unsqueeze(1).expand_as(x), 0.0) + + if (not torch.jit.is_scripting() and not torch.jit.is_tracing() + and chunk_size >= 0): + # Not support exporting a model for simulated streaming decoding + assert ( + self.causal + ), 'Must initialize model with causal=True if you use chunk_size' + x = self.depthwise_conv(x, chunk_size=chunk_size) + else: + # with record_function("depthwise_conv"): + x = self.depthwise_conv(x) + # pass + + x = self.balancer2(x) + x = x.permute(2, 0, 1) # (time, batch, channels) + + x = self.whiten(x) # (time, batch, channels) + x = self.out_proj(x) # (time, batch, channels) + + return x + + +if __name__ == '__main__': + logging.getLogger().setLevel(logging.INFO) + torch.set_num_threads(1) + torch.set_num_interop_threads(1) diff --git a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py b/modelscope/models/audio/asr/generic_automatic_speech_recognition.py deleted file mode 100644 index 5e02076ee..000000000 --- a/modelscope/models/audio/asr/generic_automatic_speech_recognition.py +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
- -import os -from typing import Any, Dict - -from modelscope.metainfo import Models -from modelscope.models.base import Model -from modelscope.models.builder import MODELS -from modelscope.utils.constant import Frameworks, Tasks - -__all__ = ['GenericAutomaticSpeechRecognition'] - - -@MODELS.register_module( - Tasks.auto_speech_recognition, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.voice_activity_detection, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.speech_separation, module_name=Models.generic_asr) -@MODELS.register_module( - Tasks.language_score_prediction, module_name=Models.generic_asr) -@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.generic_asr) -class GenericAutomaticSpeechRecognition(Model): - - def __init__(self, model_dir: str, am_model_name: str, - model_config: Dict[str, Any], *args, **kwargs): - """initialize the info of model. - - Args: - model_dir (str): the model path. - am_model_name (str): the am model name from configuration.json - model_config (Dict[str, Any]): the detail config about model from configuration.json - """ - super().__init__(model_dir, am_model_name, model_config, *args, - **kwargs) - self.model_cfg = { - # the recognition model dir path - 'model_workspace': model_dir, - # the am model name - 'am_model': am_model_name, - # the am model file path - 'am_model_path': os.path.join(model_dir, am_model_name), - # the recognition model config dict - 'model_config': model_config - } - - def forward(self) -> Dict[str, Any]: - """preload model and return the info of the model - """ - - return self.model_cfg diff --git a/tests/metrics/__init__.py b/modelscope/models/audio/funasr/__init__.py similarity index 100% rename from tests/metrics/__init__.py rename to modelscope/models/audio/funasr/__init__.py diff --git a/modelscope/models/audio/funasr/model.py b/modelscope/models/audio/funasr/model.py new file mode 100644 index 000000000..73ffc6189 --- /dev/null +++ b/modelscope/models/audio/funasr/model.py @@ -0,0 +1,62 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +import json +from funasr import AutoModel + +from modelscope.metainfo import Models +from modelscope.models.base import Model +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Frameworks, Tasks + +__all__ = ['GenericFunASR'] + + +@MODELS.register_module( + Tasks.auto_speech_recognition, module_name=Models.funasr) +@MODELS.register_module( + Tasks.voice_activity_detection, module_name=Models.funasr) +@MODELS.register_module( + Tasks.language_score_prediction, module_name=Models.funasr) +@MODELS.register_module(Tasks.punctuation, module_name=Models.funasr) +@MODELS.register_module(Tasks.speaker_diarization, module_name=Models.funasr) +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.funasr) +@MODELS.register_module(Tasks.speech_separation, module_name=Models.funasr) +@MODELS.register_module(Tasks.speech_timestamp, module_name=Models.funasr) +@MODELS.register_module(Tasks.emotion_recognition, module_name=Models.funasr) +class GenericFunASR(Model): + + def __init__(self, model_dir, *args, **kwargs): + """initialize the info of model. + + Args: + model_dir (str): the model path. 
+ am_model_name (str): the am model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json + """ + super().__init__(model_dir, *args, **kwargs) + model_cfg = json.loads( + open(os.path.join(model_dir, 'configuration.json')).read()) + if 'vad_model' not in kwargs and 'vad_model' in model_cfg: + kwargs['vad_model'] = model_cfg['vad_model'] + kwargs['vad_model_revision'] = model_cfg.get( + 'vad_model_revision', None) + if 'punc_model' not in kwargs and 'punc_model' in model_cfg: + kwargs['punc_model'] = model_cfg['punc_model'] + kwargs['punc_model_revision'] = model_cfg.get( + 'punc_model_revision', None) + if 'spk_model' not in kwargs and 'spk_model' in model_cfg: + kwargs['spk_model'] = model_cfg['spk_model'] + kwargs['spk_model_revision'] = model_cfg.get( + 'spk_model_revision', None) + + self.model = AutoModel(model=model_dir, **kwargs) + + def forward(self, *args, **kwargs): + """preload model and return the info of the model + """ + + output = self.model.generate(*args, **kwargs) + return output diff --git a/modelscope/models/audio/quantization/__init__.py b/modelscope/models/audio/quantization/__init__.py new file mode 100644 index 000000000..4952a0765 --- /dev/null +++ b/modelscope/models/audio/quantization/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .generic_audio_quantization import GenericAudioQuantization + +else: + _import_structure = { + 'generic_audio_quantization': ['GenericAudioQuantization'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/audio/sv/generic_speaker_verification.py b/modelscope/models/audio/quantization/generic_audio_quantization.py similarity index 87% rename from modelscope/models/audio/sv/generic_speaker_verification.py rename to modelscope/models/audio/quantization/generic_audio_quantization.py index 788ccf7c7..2967cd3c2 100644 --- a/modelscope/models/audio/sv/generic_speaker_verification.py +++ b/modelscope/models/audio/quantization/generic_audio_quantization.py @@ -8,12 +8,12 @@ from modelscope.models.builder import MODELS from modelscope.utils.constant import Frameworks, Tasks +__all__ = ['GenericAudioQuantization'] + @MODELS.register_module( - Tasks.speaker_verification, module_name=Models.generic_sv) -@MODELS.register_module( - Tasks.speaker_diarization, module_name=Models.generic_sv) -class SpeakerVerification(Model): + Tasks.audio_quantization, module_name=Models.audio_quantization) +class GenericAudioQuantization(Model): def __init__(self, model_dir: str, model_name: str, model_config: Dict[str, Any], *args, **kwargs): diff --git a/modelscope/models/audio/sv/ERes2Net.py b/modelscope/models/audio/sv/ERes2Net.py index 0119783c3..0d4a81374 100644 --- a/modelscope/models/audio/sv/ERes2Net.py +++ b/modelscope/models/audio/sv/ERes2Net.py @@ -19,6 +19,7 @@ from modelscope.models import MODELS, TorchModel from modelscope.models.audio.sv.fusion import AFF from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device class ReLU(nn.Hardtanh): @@ -54,11 +55,11 @@ def conv3x3(in_planes, out_planes, stride=1): bias=False) -class BasicBlockRes2Net(nn.Module): +class BasicBlockERes2Net(nn.Module): expansion = 2 def __init__(self, in_planes, planes, stride=1, 
baseWidth=32, scale=2): - super(BasicBlockRes2Net, self).__init__() + super(BasicBlockERes2Net, self).__init__() width = int(math.floor(planes * (baseWidth / 64.0))) self.conv1 = conv1x1(in_planes, width * scale, stride) self.bn1 = nn.BatchNorm2d(width * scale) @@ -117,11 +118,11 @@ def forward(self, x): return out -class BasicBlockRes2Net_diff_AFF(nn.Module): +class BasicBlockERes2Net_AFF(nn.Module): expansion = 2 def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): - super(BasicBlockRes2Net_diff_AFF, self).__init__() + super(BasicBlockERes2Net_AFF, self).__init__() width = int(math.floor(planes * (baseWidth / 64.0))) self.conv1 = conv1x1(in_planes, width * scale, stride) self.bn1 = nn.BatchNorm2d(width * scale) @@ -189,8 +190,8 @@ def forward(self, x): class ERes2Net(nn.Module): def __init__(self, - block=BasicBlockRes2Net, - block_fuse=BasicBlockRes2Net_diff_AFF, + block=BasicBlockERes2Net, + block_fuse=BasicBlockERes2Net_AFF, num_blocks=[3, 4, 6, 3], m_channels=32, feat_dim=80, @@ -314,6 +315,7 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, self.m_channels = self.model_config['channels'] self.other_config = kwargs self.feature_dim = 80 + self.device = create_device(self.other_config['device']) self.embedding_model = ERes2Net( embed_dim=self.embed_dim, m_channels=self.m_channels) @@ -321,6 +323,7 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, pretrained_model_name = kwargs['pretrained_model'] self.__load_check_point(pretrained_model_name) + self.embedding_model.to(self.device) self.embedding_model.eval() def forward(self, audio): @@ -333,7 +336,7 @@ def forward(self, audio): ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' # audio shape: [N, T] feature = self.__extract_feature(audio) - embedding = self.embedding_model(feature) + embedding = self.embedding_model(feature.to(self.device)) return embedding.detach().cpu() diff --git a/modelscope/models/audio/sv/ERes2NetV2.py b/modelscope/models/audio/sv/ERes2NetV2.py new file mode 100644 index 000000000..d842a0948 --- /dev/null +++ b/modelscope/models/audio/sv/ERes2NetV2.py @@ -0,0 +1,345 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" + To further improve the short-duration feature extraction capability of ERes2Net, + we expand the channel dimension within each stage. However, this modification also + increases the number of model parameters and computational complexity. + To alleviate this problem, we propose an improved ERes2NetV2 by pruning redundant structures, + ultimately reducing both the model parameters and its computational cost. 
+""" + +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.models.audio.sv.fusion import AFF +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class ReLU(nn.Hardtanh): + + def __init__(self, inplace=False): + super(ReLU, self).__init__(0, 20, inplace) + + def __repr__(self): + inplace_str = 'inplace' if self.inplace else '' + return self.__class__.__name__ + ' (' \ + + inplace_str + ')' + + +class BasicBlockERes2NetV2(nn.Module): + + def __init__(self, + in_planes, + planes, + stride=1, + baseWidth=26, + scale=2, + expansion=2): + super(BasicBlockERes2NetV2, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.width = width + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale + self.expansion = expansion + + convs = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class BasicBlockERes2NetV2AFF(nn.Module): + + def __init__(self, + in_planes, + planes, + stride=1, + baseWidth=26, + scale=2, + expansion=2): + super(BasicBlockERes2NetV2AFF, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.width = width + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale + self.expansion = expansion + + convs = [] + fuse_models = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + for j in range(self.nums - 1): + fuse_models.append(AFF(channels=width, r=4)) + + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.fuse_models = nn.ModuleList(fuse_models) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = 
nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = self.fuse_models[i - 1](sp, spx[i]) + + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class ERes2NetV2(nn.Module): + + def __init__(self, + block=BasicBlockERes2NetV2, + block_fuse=BasicBlockERes2NetV2AFF, + num_blocks=[3, 4, 6, 3], + m_channels=64, + feat_dim=80, + embed_dim=192, + baseWidth=26, + scale=2, + expansion=2, + pooling_func='TSTP', + two_emb_layer=False): + super(ERes2NetV2, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embed_dim = embed_dim + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + self.two_emb_layer = two_emb_layer + self.baseWidth = baseWidth + self.scale = scale + self.expansion = expansion + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block_fuse, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block_fuse, m_channels * 8, num_blocks[3], stride=2) + + # Downsampling module + self.layer3_ds = nn.Conv2d( + m_channels * 4 * self.expansion, + m_channels * 8 * self.expansion, + kernel_size=3, + padding=1, + stride=2, + bias=False) + + # Bottom-up fusion module + self.fuse34 = AFF(channels=m_channels * 8 * self.expansion, r=4) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * self.expansion) + self.seg_1 = nn.Linear(self.stats_dim * self.expansion * self.n_stats, + embed_dim) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embed_dim, affine=False) + self.seg_2 = nn.Linear(embed_dim, embed_dim) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append( + block( + self.in_planes, + planes, + stride, + baseWidth=self.baseWidth, + scale=self.scale, + expansion=self.expansion)) + self.in_planes = planes * self.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out1 = self.layer1(out) + out2 = self.layer2(out1) + out3 = self.layer3(out2) + out4 = self.layer4(out3) + out3_ds = self.layer3_ds(out3) + fuse_out34 = self.fuse34(out4, out3_ds) + stats = self.pool(fuse_out34) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.eres2netv2_sv) +class SpeakerVerificationERes2NetV2(TorchModel): + r"""ERes2NetV2 architecture with local and 
global feature fusion. ERes2NetV2 is mainly composed + of Bottom-up Dual-stage Feature Fusion (BDFF) and Bottleneck-like Local Feature Fusion (BLFF). + BDFF fuses multi-scale feature maps in bottom-up pathway to obtain global information. + The BLFF extracts localization-preserved speaker features and strengthen the local information interaction. + Args: + model_dir: A model dir. + model_config: The model config. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.baseWidth = self.model_config['baseWidth'] + self.scale = self.model_config['scale'] + self.expansion = self.model_config['expansion'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = ERes2NetV2( + embed_dim=self.embed_dim, + baseWidth=self.baseWidth, + scale=self.scale, + expansion=self.expansion) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/ERes2Net_aug.py b/modelscope/models/audio/sv/ERes2Net_aug.py index d0739cad2..5540ff3ef 100644 --- a/modelscope/models/audio/sv/ERes2Net_aug.py +++ b/modelscope/models/audio/sv/ERes2Net_aug.py @@ -19,6 +19,7 @@ from modelscope.models import MODELS, TorchModel from modelscope.models.audio.sv.fusion import AFF from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device class ReLU(nn.Hardtanh): @@ -308,12 +309,13 @@ def __init__(self, model_dir, model_config: Dict[str, Any], *args, self.model_config = model_config self.other_config = kwargs self.feature_dim = 80 - + self.device = create_device(self.other_config['device']) self.embedding_model = ERes2Net_aug() pretrained_model_name = kwargs['pretrained_model'] self.__load_check_point(pretrained_model_name) + self.embedding_model.to(self.device) self.embedding_model.eval() def forward(self, audio): @@ -326,7 +328,7 @@ def forward(self, audio): ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' # audio shape: [N, T] feature = self.__extract_feature(audio) - embedding = self.embedding_model(feature) + embedding = self.embedding_model(feature.to(self.device)) return embedding.detach().cpu() diff --git a/modelscope/models/audio/sv/Res2Net.py b/modelscope/models/audio/sv/Res2Net.py new file mode 100644 index 000000000..0d26e6014 --- /dev/null +++ 
b/modelscope/models/audio/sv/Res2Net.py @@ -0,0 +1,234 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" Res2Net implementation is adapted from https://github.com/Res2Net/Res2Net-PretrainedModels. + Res2Net is an advanced neural network architecture that enhances the capabilities of standard ResNets + by incorporating hierarchical residual-like connections. This innovative structure improves + performance across various computer vision tasks, such as image classification and object + detection, without significant computational overhead. + Reference: https://arxiv.org/pdf/1904.01169.pdf + Some modifications from the original architecture: + 1. Smaller kernel size for the input layer + 2. Smaller expansion in BasicBlockRes2Net +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class ReLU(nn.Hardtanh): + + def __init__(self, inplace=False): + super(ReLU, self).__init__(0, 20, inplace) + + def __repr__(self): + inplace_str = 'inplace' if self.inplace else '' + return self.__class__.__name__ + ' (' \ + + inplace_str + ')' + + +class BasicBlockRes2Net(nn.Module): + expansion = 2 + + def __init__(self, in_planes, planes, stride=1, baseWidth=32, scale=2): + super(BasicBlockRes2Net, self).__init__() + width = int(math.floor(planes * (baseWidth / 64.0))) + self.conv1 = nn.Conv2d( + in_planes, width * scale, kernel_size=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(width * scale) + self.nums = scale - 1 + convs = [] + bns = [] + for i in range(self.nums): + convs.append( + nn.Conv2d(width, width, kernel_size=3, padding=1, bias=False)) + bns.append(nn.BatchNorm2d(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.relu = ReLU(inplace=True) + + self.conv3 = nn.Conv2d( + width * scale, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + self.stride = stride + self.width = width + self.scale = scale + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + spx = torch.split(out, self.width, 1) + for i in range(self.nums): + if i == 0: + sp = spx[i] + else: + sp = sp + spx[i] + sp = self.convs[i](sp) + sp = self.relu(self.bns[i](sp)) + if i == 0: + out = sp + else: + out = torch.cat((out, sp), 1) + + out = torch.cat((out, spx[self.nums]), 1) + + out = self.conv3(out) + out = self.bn3(out) + + residual = self.shortcut(x) + out += residual + out = self.relu(out) + + return out + + +class Res2Net(nn.Module): + + def __init__(self, + block=BasicBlockRes2Net, + num_blocks=[3, 4, 6, 3], + m_channels=32, + feat_dim=80, + embedding_size=192, + pooling_func='TSTP', + two_emb_layer=False): + super(Res2Net, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embedding_size = embedding_size + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + 
self.two_emb_layer = two_emb_layer + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block, m_channels * 8, num_blocks[3], stride=2) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * block.expansion) + self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, + embedding_size) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) + self.seg_2 = nn.Linear(embedding_size, embedding_size) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out = self.layer1(out) + out = self.layer2(out) + out = self.layer3(out) + out = self.layer4(out) + + stats = self.pool(out) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.res2net_sv) +class SpeakerVerificationResNet(TorchModel): + r""" + Args: + model_dir: A model dir. + model_config: The model config. 
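The pooling_layers module imported by these backbones is not part of this diff. As a rough stand-in, a TSTP ("temporal statistics pooling") head flattens the (N, C, F, T) feature map over channels and frequency, then concatenates its mean and standard deviation over time, which is why n_stats is 2 for TSTP and 1 for TAP/TSDP above. The class below is an assumption-based sketch, not the real implementation:

import torch
import torch.nn as nn


class TSTP(nn.Module):

    def __init__(self, in_dim):
        super().__init__()
        self.in_dim = in_dim

    def forward(self, x):
        # x: (N, C, F, T) feature map from the last conv stage
        x = x.reshape(x.shape[0], -1, x.shape[-1])   # (N, C*F, T)
        mean = x.mean(dim=-1)
        std = x.std(dim=-1)
        return torch.cat([mean, std], dim=-1)        # (N, 2 * C * F)


if __name__ == '__main__':
    print(TSTP(in_dim=2560)(torch.randn(4, 256, 10, 50)).shape)  # (4, 5120) = 2 * in_dim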
+ """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.m_channels = self.model_config['channels'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = Res2Net( + embedding_size=self.embed_dim, m_channels=self.m_channels) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/ResNet.py b/modelscope/models/audio/sv/ResNet.py new file mode 100644 index 000000000..94d303b56 --- /dev/null +++ b/modelscope/models/audio/sv/ResNet.py @@ -0,0 +1,186 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" ResNet implementation is adapted from https://github.com/wenet-e2e/wespeaker. + ResNet, or Residual Neural Network, is notable for its optimization ease + and depth-induced accuracy gains. It utilizes skip connections within its residual + blocks to counteract the vanishing gradient problem in deep networks. + Reference: Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun + Deep Residual Learning for Image Recognition. 
arXiv:1512.03385 +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, in_planes, planes, stride=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + in_planes, + planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.shortcut = nn.Sequential() + if stride != 1 or in_planes != self.expansion * planes: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_planes, + self.expansion * planes, + kernel_size=1, + stride=stride, + bias=False), nn.BatchNorm2d(self.expansion * planes)) + + def forward(self, x): + out = F.relu(self.bn1(self.conv1(x))) + out = self.bn2(self.conv2(out)) + out += self.shortcut(x) + out = F.relu(out) + return out + + +class ResNet(nn.Module): + + def __init__(self, + block=BasicBlock, + num_blocks=[3, 4, 6, 3], + m_channels=32, + feat_dim=80, + embedding_size=128, + pooling_func='TSTP', + two_emb_layer=True): + super(ResNet, self).__init__() + self.in_planes = m_channels + self.feat_dim = feat_dim + self.embedding_size = embedding_size + self.stats_dim = int(feat_dim / 8) * m_channels * 8 + self.two_emb_layer = two_emb_layer + + self.conv1 = nn.Conv2d( + 1, m_channels, kernel_size=3, stride=1, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(m_channels) + + self.layer1 = self._make_layer( + block, m_channels, num_blocks[0], stride=1) + self.layer2 = self._make_layer( + block, m_channels * 2, num_blocks[1], stride=2) + self.layer3 = self._make_layer( + block, m_channels * 4, num_blocks[2], stride=2) + self.layer4 = self._make_layer( + block, m_channels * 8, num_blocks[3], stride=2) + + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim * block.expansion) + self.seg_1 = nn.Linear(self.stats_dim * block.expansion * self.n_stats, + embedding_size) + if self.two_emb_layer: + self.seg_bn_1 = nn.BatchNorm1d(embedding_size, affine=False) + self.seg_2 = nn.Linear(embedding_size, embedding_size) + else: + self.seg_bn_1 = nn.Identity() + self.seg_2 = nn.Identity() + + def _make_layer(self, block, planes, num_blocks, stride): + strides = [stride] + [1] * (num_blocks - 1) + layers = [] + for stride in strides: + layers.append(block(self.in_planes, planes, stride)) + self.in_planes = planes * block.expansion + return nn.Sequential(*layers) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) => (B,F,T) + x = x.unsqueeze_(1) + out = F.relu(self.bn1(self.conv1(x))) + out1 = self.layer1(out) + out2 = self.layer2(out1) + out3 = self.layer3(out2) + out = self.layer4(out3) + stats = self.pool(out) + + embed_a = self.seg_1(stats) + if self.two_emb_layer: + out = F.relu(embed_a) + out = self.seg_bn_1(out) + embed_b = self.seg_2(out) + return embed_b + else: + return embed_a + + +@MODELS.register_module( + Tasks.speaker_verification, module_name=Models.resnet_sv) +class SpeakerVerificationResNet(TorchModel): + r""" + 
Args: + model_dir: A model dir. + model_config: The model config. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.embed_dim = self.model_config['embed_dim'] + self.m_channels = self.model_config['channels'] + self.other_config = kwargs + self.feature_dim = 80 + self.device = create_device(self.other_config['device']) + + self.embedding_model = ResNet( + embedding_size=self.embed_dim, m_channels=self.m_channels) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device), + strict=True) diff --git a/modelscope/models/audio/sv/lanuage_recognition_eres2net.py b/modelscope/models/audio/sv/lanuage_recognition_eres2net.py index 0876cd2e5..927d9b00f 100644 --- a/modelscope/models/audio/sv/lanuage_recognition_eres2net.py +++ b/modelscope/models/audio/sv/lanuage_recognition_eres2net.py @@ -92,9 +92,9 @@ def forward(self, audio): # audio shape: [N, T] feature = self._extract_feature(audio) embs = self.encoder(feature.to(self.device)) - output = self.backend(embs) - output = output.detach().cpu().argmax(-1) - return output + scores = self.backend(embs).detach() + output = scores.cpu().argmax(-1) + return scores, output def _extract_feature(self, audio): features = [] diff --git a/modelscope/models/audio/sv/lanuage_recognition_model.py b/modelscope/models/audio/sv/lanuage_recognition_model.py index 3ab531282..1f7da7605 100644 --- a/modelscope/models/audio/sv/lanuage_recognition_model.py +++ b/modelscope/models/audio/sv/lanuage_recognition_model.py @@ -89,9 +89,9 @@ def forward(self, audio): # audio shape: [N, T] feature = self._extract_feature(audio) embs = self.encoder(feature.to(self.device)) - output = self.backend(embs) - output = output.detach().cpu().argmax(-1) - return output + scores = self.backend(embs).detach() + output = scores.cpu().argmax(-1) + return scores, output def _extract_feature(self, audio): features = [] diff --git a/modelscope/models/audio/sv/sdpn.py b/modelscope/models/audio/sv/sdpn.py new file mode 100644 index 000000000..2c279e9d7 --- /dev/null +++ b/modelscope/models/audio/sv/sdpn.py @@ -0,0 +1,614 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" This ECAPA-TDNN implementation is adapted from https://github.com/speechbrain/speechbrain. + Self-Distillation Prototypes Network (SDPN) is a self-supervised learning framework in SV. + It comprises a teacher and a student network with identical architecture + but different parameters.
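This file only loads a pretrained teacher for inference; during self-supervised training the teacher in such a setup is typically not optimised by gradients but tracks the student with an exponential moving average. A sketch of that update under those assumptions (function name and momentum value are illustrative):

import torch


@torch.no_grad()
def ema_update(teacher: torch.nn.Module, student: torch.nn.Module, momentum: float = 0.996):
    # teacher parameters drift slowly towards the student's current parameters
    for p_t, p_s in zip(teacher.parameters(), student.parameters()):
        p_t.data.mul_(momentum).add_(p_s.data, alpha=1.0 - momentum)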
Teacher/student network consists of three main modules: + the encoder for extracting speaker embeddings, multi-layer perceptron for + feature transformation, and prototypes for computing soft-distributions between + global and local views. EMA denotes Exponential Moving Average. +""" +import math +import os +from typing import Any, Dict, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks + + +def length_to_mask(length, max_len=None, dtype=None, device=None): + assert len(length.shape) == 1 + + if max_len is None: + max_len = length.max().long().item() + mask = torch.arange( + max_len, device=length.device, dtype=length.dtype).expand( + len(length), max_len) < length.unsqueeze(1) + + if dtype is None: + dtype = length.dtype + + if device is None: + device = length.device + + mask = torch.as_tensor(mask, dtype=dtype, device=device) + return mask + + +def get_padding_elem(L_in: int, stride: int, kernel_size: int, dilation: int): + if stride > 1: + n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) + L_out = stride * (n_steps - 1) + kernel_size * dilation + padding = [kernel_size // 2, kernel_size // 2] + + else: + L_out = (L_in - dilation * (kernel_size - 1) - 1) // stride + 1 + + padding = [(L_in - L_out) // 2, (L_in - L_out) // 2] + return padding + + +class Conv1d(nn.Module): + + def __init__( + self, + out_channels, + kernel_size, + in_channels, + stride=1, + dilation=1, + padding='same', + groups=1, + bias=True, + padding_mode='reflect', + ): + super().__init__() + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.padding = padding + self.padding_mode = padding_mode + + self.conv = nn.Conv1d( + in_channels, + out_channels, + self.kernel_size, + stride=self.stride, + dilation=self.dilation, + padding=0, + groups=groups, + bias=bias, + ) + + def forward(self, x): + if self.padding == 'same': + x = self._manage_padding(x, self.kernel_size, self.dilation, + self.stride) + + elif self.padding == 'causal': + num_pad = (self.kernel_size - 1) * self.dilation + x = F.pad(x, (num_pad, 0)) + + elif self.padding == 'valid': + pass + + else: + raise ValueError( + "Padding must be 'same', 'valid' or 'causal'. 
Got " + + self.padding) + + wx = self.conv(x) + + return wx + + def _manage_padding( + self, + x, + kernel_size: int, + dilation: int, + stride: int, + ): + L_in = x.shape[-1] + padding = get_padding_elem(L_in, stride, kernel_size, dilation) + x = F.pad(x, padding, mode=self.padding_mode) + + return x + + +class BatchNorm1d(nn.Module): + + def __init__( + self, + input_size, + eps=1e-05, + momentum=0.1, + ): + super().__init__() + self.norm = nn.BatchNorm1d( + input_size, + eps=eps, + momentum=momentum, + ) + + def forward(self, x): + return self.norm(x) + + +class TDNNBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation, + activation=nn.ReLU, + groups=1, + ): + super(TDNNBlock, self).__init__() + self.conv = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + dilation=dilation, + groups=groups, + ) + self.activation = activation() + self.norm = BatchNorm1d(input_size=out_channels) + + def forward(self, x): + return self.norm(self.activation(self.conv(x))) + + +class Res2NetBlock(torch.nn.Module): + + def __init__(self, + in_channels, + out_channels, + scale=8, + kernel_size=3, + dilation=1): + super(Res2NetBlock, self).__init__() + assert in_channels % scale == 0 + assert out_channels % scale == 0 + + in_channel = in_channels // scale + hidden_channel = out_channels // scale + + self.blocks = nn.ModuleList([ + TDNNBlock( + in_channel, + hidden_channel, + kernel_size=kernel_size, + dilation=dilation, + ) for i in range(scale - 1) + ]) + self.scale = scale + + def forward(self, x): + y = [] + for i, x_i in enumerate(torch.chunk(x, self.scale, dim=1)): + if i == 0: + y_i = x_i + elif i == 1: + y_i = self.blocks[i - 1](x_i) + else: + y_i = self.blocks[i - 1](x_i + y_i) + y.append(y_i) + y = torch.cat(y, dim=1) + return y + + +class SEBlock(nn.Module): + + def __init__(self, in_channels, se_channels, out_channels): + super(SEBlock, self).__init__() + + self.conv1 = Conv1d( + in_channels=in_channels, out_channels=se_channels, kernel_size=1) + self.relu = torch.nn.ReLU(inplace=True) + self.conv2 = Conv1d( + in_channels=se_channels, out_channels=out_channels, kernel_size=1) + self.sigmoid = torch.nn.Sigmoid() + + def forward(self, x, lengths=None): + L = x.shape[-1] + if lengths is not None: + mask = length_to_mask(lengths * L, max_len=L, device=x.device) + mask = mask.unsqueeze(1) + total = mask.sum(dim=2, keepdim=True) + s = (x * mask).sum(dim=2, keepdim=True) / total + else: + s = x.mean(dim=2, keepdim=True) + + s = self.relu(self.conv1(s)) + s = self.sigmoid(self.conv2(s)) + + return s * x + + +class AttentiveStatisticsPooling(nn.Module): + + def __init__(self, channels, attention_channels=128, global_context=True): + super().__init__() + + self.eps = 1e-12 + self.global_context = global_context + if global_context: + self.tdnn = TDNNBlock(channels * 3, attention_channels, 1, 1) + else: + self.tdnn = TDNNBlock(channels, attention_channels, 1, 1) + self.tanh = nn.Tanh() + self.conv = Conv1d( + in_channels=attention_channels, + out_channels=channels, + kernel_size=1) + + def forward(self, x, lengths=None): + L = x.shape[-1] + + def _compute_statistics(x, m, dim=2, eps=self.eps): + mean = (m * x).sum(dim) + std = torch.sqrt( + (m * (x - mean.unsqueeze(dim)).pow(2)).sum(dim).clamp(eps)) + return mean, std + + if lengths is None: + lengths = torch.ones(x.shape[0], device=x.device) + + # Make binary mask of shape [N, 1, L] + mask = length_to_mask(lengths * L, max_len=L, device=x.device) + mask = 
mask.unsqueeze(1) + + # Expand the temporal context of the pooling layer by allowing the + # self-attention to look at global properties of the utterance. + if self.global_context: + # torch.std is unstable for backward computation + # https://github.com/pytorch/pytorch/issues/4320 + total = mask.sum(dim=2, keepdim=True).float() + mean, std = _compute_statistics(x, mask / total) + mean = mean.unsqueeze(2).repeat(1, 1, L) + std = std.unsqueeze(2).repeat(1, 1, L) + attn = torch.cat([x, mean, std], dim=1) + else: + attn = x + + # Apply layers + attn = self.conv(self.tanh(self.tdnn(attn))) + + # Filter out zero-paddings + attn = attn.masked_fill(mask == 0, float('-inf')) + + attn = F.softmax(attn, dim=2) + mean, std = _compute_statistics(x, attn) + # Append mean and std of the batch + pooled_stats = torch.cat((mean, std), dim=1) + pooled_stats = pooled_stats.unsqueeze(2) + + return pooled_stats + + +class SERes2NetBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + res2net_scale=8, + se_channels=128, + kernel_size=1, + dilation=1, + activation=torch.nn.ReLU, + groups=1, + ): + super().__init__() + self.out_channels = out_channels + self.tdnn1 = TDNNBlock( + in_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + groups=groups, + ) + self.res2net_block = Res2NetBlock(out_channels, out_channels, + res2net_scale, kernel_size, dilation) + self.tdnn2 = TDNNBlock( + out_channels, + out_channels, + kernel_size=1, + dilation=1, + activation=activation, + groups=groups, + ) + self.se_block = SEBlock(out_channels, se_channels, out_channels) + + self.shortcut = None + if in_channels != out_channels: + self.shortcut = Conv1d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + residual = x + if self.shortcut: + residual = self.shortcut(x) + + x = self.tdnn1(x) + x = self.res2net_block(x) + x = self.tdnn2(x) + x = self.se_block(x, lengths) + + return x + residual + + +class ECAPA_TDNN(nn.Module): + """An implementation of the speaker embedding model in a paper. + "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in + TDNN Based Speaker Verification" (https://arxiv.org/abs/2005.07143). 
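The AttentiveStatisticsPooling module above reduces a (batch, channels, time) map to an attention-weighted mean plus an attention-weighted standard deviation, with the weights produced by the small TDNN attention network. A stripped-down sketch with random stand-in attention weights (no masking, shapes illustrative):

import torch

x = torch.randn(2, 1536, 100)                            # (batch, channels, time)
attn = torch.softmax(torch.randn(2, 1536, 100), dim=2)   # stand-in for the attention net output
mean = (attn * x).sum(dim=2)
std = torch.sqrt((attn * (x - mean.unsqueeze(2)).pow(2)).sum(dim=2).clamp(1e-12))
pooled = torch.cat((mean, std), dim=1)                    # (2, 3072), twice the channel dimension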
+ """ + + def __init__( + self, + input_size, + device='cpu', + lin_neurons=512, + activation=torch.nn.ReLU, + channels=[512, 512, 512, 512, 1536], + kernel_sizes=[5, 3, 3, 3, 1], + dilations=[1, 2, 3, 4, 1], + attention_channels=128, + res2net_scale=8, + se_channels=128, + global_context=True, + groups=[1, 1, 1, 1, 1], + ): + + super().__init__() + assert len(channels) == len(kernel_sizes) + assert len(channels) == len(dilations) + self.channels = channels + self.blocks = nn.ModuleList() + + # The initial TDNN layer + self.blocks.append( + TDNNBlock( + input_size, + channels[0], + kernel_sizes[0], + dilations[0], + activation, + groups[0], + )) + + # SE-Res2Net layers + for i in range(1, len(channels) - 1): + self.blocks.append( + SERes2NetBlock( + channels[i - 1], + channels[i], + res2net_scale=res2net_scale, + se_channels=se_channels, + kernel_size=kernel_sizes[i], + dilation=dilations[i], + activation=activation, + groups=groups[i], + )) + + # Multi-layer feature aggregation + self.mfa = TDNNBlock( + channels[-1], + channels[-1], + kernel_sizes[-1], + dilations[-1], + activation, + groups=groups[-1], + ) + + # Attentive Statistical Pooling + self.asp = AttentiveStatisticsPooling( + channels[-1], + attention_channels=attention_channels, + global_context=global_context, + ) + self.asp_bn = BatchNorm1d(input_size=channels[-1] * 2) + + # Final linear transformation + self.fc = Conv1d( + in_channels=channels[-1] * 2, + out_channels=lin_neurons, + kernel_size=1, + ) + + def forward(self, x, lengths=None): + """Returns the embedding vector. + + Arguments + --------- + x : torch.Tensor + Tensor of shape (batch, time, channel). + """ + x = x.transpose(1, 2) + + xl = [] + for layer in self.blocks: + try: + x = layer(x, lengths=lengths) + except TypeError: + x = layer(x) + xl.append(x) + + # Multi-layer feature aggregation + x = torch.cat(xl[1:], dim=1) + x = self.mfa(x) + + # Attentive Statistical Pooling + x = self.asp(x, lengths=lengths) + x = self.asp_bn(x) + + # Final linear transformation + x = self.fc(x) + + x = x.transpose(1, 2).squeeze(1) + return x + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_.' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l_ = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l_, u], then translate to + # [2l-1, 2u-1]. 
+ tensor.uniform_(2 * l_ - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + + +class SDPNHead(nn.Module): + + def __init__(self, + in_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256): + super().__init__() + nlayers = max(nlayers, 1) + if nlayers == 1: + self.mlp = nn.Linear(in_dim, bottleneck_dim) + else: + layers = [nn.Linear(in_dim, hidden_dim)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim)) + self.mlp = nn.Sequential(*layers) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + x = nn.functional.normalize(x, dim=-1, p=2) + return x + + +class Combiner(torch.nn.Module): + """ + Combine backbone (ECAPA) and head (MLP) + """ + + def __init__(self, backbone, head): + super(Combiner, self).__init__() + self.backbone = backbone + self.head = head + + def forward(self, x): + x = self.backbone(x) + output = self.head(x) + return x, output + + +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.sdpn_sv) +class SpeakerVerificationSDPN(TorchModel): + """ + Self-Distillation Prototypes Network (SDPN) effectively facilitates + self-supervised speaker representation learning. The specific structure can be + referred to in https://arxiv.org/pdf/2308.02774. + """ + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.other_config = kwargs + if self.model_config['channel'] != 1024: + raise ValueError( + 'modelscope error: Currently only 1024-channel ecapa tdnn is supported.' 
+ ) + + self.feature_dim = 80 + channels_config = [1024, 1024, 1024, 1024, 3072] + + self.embedding_model = ECAPA_TDNN( + self.feature_dim, channels=channels_config) + self.embedding_model = Combiner(self.embedding_model, + SDPNHead(512, True)) + + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.eval() + + def forward(self, audio): + assert len(audio.shape) == 2 and audio.shape[ + 0] == 1, 'modelscope error: the shape of input audio to model needs to be [1, T]' + # audio shape: [1, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model.backbone(feature) + + return embedding + + def __extract_feature(self, audio): + feature = Kaldi.fbank(audio, num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + feature = feature.unsqueeze(0) + return feature + + def __load_check_point(self, pretrained_model_name, device=None): + if not device: + device = torch.device('cpu') + state_dict = torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=device) + state_dict_tea = { + k.replace('module.', ''): v + for k, v in state_dict['teacher'].items() + } + self.embedding_model.load_state_dict(state_dict_tea, strict=True) diff --git a/modelscope/models/audio/sv/xvector.py b/modelscope/models/audio/sv/xvector.py new file mode 100644 index 000000000..4a4c15a4a --- /dev/null +++ b/modelscope/models/audio/sv/xvector.py @@ -0,0 +1,153 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +""" + This TDNN implementation is adapted from https://github.com/wenet-e2e/wespeaker. + TDNN replaces i-vectors for text-independent speaker verification with embeddings + extracted from a feedforward deep neural network. The specific structure can be + referred to in https://www.danielpovey.com/files/2017_interspeech_embeddings.pdf. +""" +import math +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchaudio.compliance.kaldi as Kaldi + +import modelscope.models.audio.sv.pooling_layers as pooling_layers +from modelscope.metainfo import Models +from modelscope.models import MODELS, TorchModel +from modelscope.utils.constant import Tasks +from modelscope.utils.device import create_device + + +class TdnnLayer(nn.Module): + + def __init__(self, in_dim, out_dim, context_size, dilation=1, padding=0): + """Define the TDNN layer, essentially 1-D convolution + + Args: + in_dim (int): input dimension + out_dim (int): output channels + context_size (int): context size, essentially the filter size + dilation (int, optional): Defaults to 1. + padding (int, optional): Defaults to 0. 
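A TdnnLayer is a dilated 1-D convolution over frames, so the temporal receptive field of a stack can be worked out by hand; for the five frame-level layers of the XVEC model defined below this comes to 15 frames. The helper is illustrative only:

def receptive_field(layers):
    # each (context_size, dilation) conv grows the field by (context - 1) * dilation frames
    rf = 1
    for context, dilation in layers:
        rf += (context - 1) * dilation
    return rf


# frame-level layers below, as (context_size, dilation) pairs
print(receptive_field([(5, 1), (3, 2), (3, 3), (1, 1), (1, 1)]))  # 15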
+ """ + super(TdnnLayer, self).__init__() + self.in_dim = in_dim + self.out_dim = out_dim + self.context_size = context_size + self.dilation = dilation + self.padding = padding + self.conv_1d = nn.Conv1d( + self.in_dim, + self.out_dim, + self.context_size, + dilation=self.dilation, + padding=self.padding) + + # Set Affine=false to be compatible with the original kaldi version + self.bn = nn.BatchNorm1d(out_dim, affine=False) + + def forward(self, x): + out = self.conv_1d(x) + out = F.relu(out) + out = self.bn(out) + return out + + +class XVEC(nn.Module): + + def __init__(self, + feat_dim=40, + hid_dim=512, + stats_dim=1500, + embed_dim=512, + pooling_func='TSTP'): + """ + Implementation of Kaldi style xvec, as described in + X-VECTORS: ROBUST DNN EMBEDDINGS FOR SPEAKER RECOGNITION + """ + super(XVEC, self).__init__() + self.feat_dim = feat_dim + self.stats_dim = stats_dim + self.embed_dim = embed_dim + + self.frame_1 = TdnnLayer(feat_dim, hid_dim, context_size=5, dilation=1) + self.frame_2 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=2) + self.frame_3 = TdnnLayer(hid_dim, hid_dim, context_size=3, dilation=3) + self.frame_4 = TdnnLayer(hid_dim, hid_dim, context_size=1, dilation=1) + self.frame_5 = TdnnLayer( + hid_dim, stats_dim, context_size=1, dilation=1) + self.n_stats = 1 if pooling_func == 'TAP' or pooling_func == 'TSDP' else 2 + self.pool = getattr(pooling_layers, pooling_func)( + in_dim=self.stats_dim) + self.seg_1 = nn.Linear(self.stats_dim * self.n_stats, embed_dim) + + def forward(self, x): + x = x.permute(0, 2, 1) # (B,T,F) -> (B,F,T) + + out = self.frame_1(x) + out = self.frame_2(out) + out = self.frame_3(out) + out = self.frame_4(out) + out = self.frame_5(out) + + stats = self.pool(out) + embed_a = self.seg_1(stats) + return embed_a + + +@MODELS.register_module(Tasks.speaker_verification, module_name=Models.tdnn_sv) +class SpeakerVerificationTDNN(TorchModel): + + def __init__(self, model_dir, model_config: Dict[str, Any], *args, + **kwargs): + super().__init__(model_dir, model_config, *args, **kwargs) + self.model_config = model_config + self.other_config = kwargs + + self.feature_dim = 80 + self.embed_dim = 512 + self.device = create_device(self.other_config['device']) + print(self.device) + + self.embedding_model = XVEC( + feat_dim=self.feature_dim, embed_dim=self.embed_dim) + pretrained_model_name = kwargs['pretrained_model'] + self.__load_check_point(pretrained_model_name) + + self.embedding_model.to(self.device) + self.embedding_model.eval() + + def forward(self, audio): + if isinstance(audio, np.ndarray): + audio = torch.from_numpy(audio) + if len(audio.shape) == 1: + audio = audio.unsqueeze(0) + assert len( + audio.shape + ) == 2, 'modelscope error: the shape of input audio to model needs to be [N, T]' + # audio shape: [N, T] + feature = self.__extract_feature(audio) + embedding = self.embedding_model(feature.to(self.device)) + + return embedding.detach().cpu() + + def __extract_feature(self, audio): + features = [] + for au in audio: + feature = Kaldi.fbank( + au.unsqueeze(0), num_mel_bins=self.feature_dim) + feature = feature - feature.mean(dim=0, keepdim=True) + features.append(feature.unsqueeze(0)) + features = torch.cat(features) + return features + + def __load_check_point(self, pretrained_model_name): + self.embedding_model.load_state_dict( + torch.load( + os.path.join(self.model_dir, pretrained_model_name), + map_location=torch.device('cpu')), + strict=True) diff --git a/modelscope/models/audio/tts/__init__.py b/modelscope/models/audio/tts/__init__.py 
index 8af35c5a3..38420985d 100644 --- a/modelscope/models/audio/tts/__init__.py +++ b/modelscope/models/audio/tts/__init__.py @@ -5,9 +5,13 @@ if TYPE_CHECKING: from .sambert_hifi import SambertHifigan + from .laura_codec import LauraCodecGenModel else: - _import_structure = {'sambert_hifi': ['SambertHifigan']} + _import_structure = { + 'sambert_hifi': ['SambertHifigan'], + 'laura_codec': ['LauraCodecGenModel'], + } import sys sys.modules[__name__] = LazyImportModule( __name__, diff --git a/modelscope/models/audio/punc/generic_punctuation.py b/modelscope/models/audio/tts/laura_codec.py similarity index 52% rename from modelscope/models/audio/punc/generic_punctuation.py rename to modelscope/models/audio/tts/laura_codec.py index dabb60905..0e50321ce 100644 --- a/modelscope/models/audio/punc/generic_punctuation.py +++ b/modelscope/models/audio/tts/laura_codec.py @@ -8,30 +8,31 @@ from modelscope.models.builder import MODELS from modelscope.utils.constant import Frameworks, Tasks +__all__ = ['LauraCodecGenModel'] -@MODELS.register_module(Tasks.punctuation, module_name=Models.generic_punc) -class PunctuationProcessing(Model): - def __init__(self, model_dir: str, punc_model_name: str, - punc_model_config: Dict[str, Any], *args, **kwargs): +@MODELS.register_module(Tasks.text_to_speech, module_name=Models.laura_codec) +class LauraCodecGenModel(Model): + + def __init__(self, model_dir: str, model_name: str, + model_config: Dict[str, Any], *args, **kwargs): """initialize the info of model. Args: model_dir (str): the model path. - punc_model_name (str): the itn model name from configuration.json - punc_model_config (Dict[str, Any]): the detail config about model from configuration.json + model_name (str): the model name from configuration.json + model_config (Dict[str, Any]): the detail config about model from configuration.json """ - super().__init__(model_dir, punc_model_name, punc_model_config, *args, - **kwargs) + super().__init__(model_dir, model_name, model_config, *args, **kwargs) self.model_cfg = { # the recognition model dir path 'model_workspace': model_dir, # the itn model name - 'punc_model': punc_model_name, + 'model_name': model_name, # the am model file path - 'punc_model_path': os.path.join(model_dir, punc_model_name), + 'model_path': os.path.join(model_dir, model_name), # the recognition model config dict - 'model_config': punc_model_config + 'model_config': model_config } def forward(self) -> Dict[str, Any]: diff --git a/modelscope/models/builder.py b/modelscope/models/builder.py index b57fba53e..f2bba487c 100644 --- a/modelscope/models/builder.py +++ b/modelscope/models/builder.py @@ -13,7 +13,7 @@ BACKBONES = MODELS HEADS = Registry('heads') -modules = LazyImportModule.AST_INDEX[INDEX_KEY] +modules = LazyImportModule.get_ast_index()[INDEX_KEY] for module_index in list(modules.keys()): if module_index[1] == Tasks.backbone and module_index[0] == 'BACKBONES': modules[(MODELS.name.upper(), module_index[1], diff --git a/modelscope/models/cv/__init__.py b/modelscope/models/cv/__init__.py index 5da87a001..2bf632f8e 100644 --- a/modelscope/models/cv/__init__.py +++ b/modelscope/models/cv/__init__.py @@ -4,14 +4,16 @@ from .
import (action_recognition, animal_recognition, bad_image_detecting, body_2d_keypoints, body_3d_keypoints, cartoon, cmdssl_video_embedding, controllable_image_generation, - crowd_counting, face_detection, face_generation, - face_reconstruction, human3d_animation, human_reconstruction, - image_classification, image_color_enhance, image_colorization, - image_defrcn_fewshot, image_denoise, image_editing, - image_inpainting, image_instance_segmentation, image_matching, - image_mvs_depth_estimation, image_panoptic_segmentation, - image_portrait_enhancement, image_probing_model, - image_quality_assessment_degradation, + crowd_counting, dense_optical_flow_estimation, face_detection, + face_generation, face_reconstruction, human3d_animation, + human_reconstruction, image_classification, image_color_enhance, + image_colorization, image_defrcn_fewshot, image_denoise, + image_editing, image_inpainting, image_instance_segmentation, + image_local_feature_matching, image_matching, + image_matching_fast, image_mvs_depth_estimation, + image_mvs_depth_estimation_geomvsnet, + image_panoptic_segmentation, image_portrait_enhancement, + image_probing_model, image_quality_assessment_degradation, image_quality_assessment_man, image_quality_assessment_mos, image_reid_person, image_restoration, image_semantic_segmentation, image_super_resolution_pasd, diff --git a/modelscope/models/cv/action_detection/modules/resnet.py b/modelscope/models/cv/action_detection/modules/resnet.py index 7f5529a48..435aea528 100644 --- a/modelscope/models/cv/action_detection/modules/resnet.py +++ b/modelscope/models/cv/action_detection/modules/resnet.py @@ -233,7 +233,7 @@ def __init__(self, ops=ops[sum(layers[:3], 0):][:layers[3]]) if num_classes is not None: self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.sptial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3) + self.spatial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3) self.drop = nn.Dropout(0.5) if reduce_dim > 0: self.rd_conv = nn.Conv2d( @@ -308,7 +308,7 @@ def features(self, x): ftr = torch.cat( (x.max(dim=1, keepdim=True)[0], x.mean(dim=1, keepdim=True)), dim=1) - score = self.sptial_atten(ftr) # N,1,H,W + score = self.spatial_atten(ftr) # N,1,H,W x = x * torch.sigmoid(score) # N,C,H,W self.score = score diff --git a/modelscope/models/cv/action_recognition/s3dg.py b/modelscope/models/cv/action_recognition/s3dg.py index 46e768927..fa271b471 100644 --- a/modelscope/models/cv/action_recognition/s3dg.py +++ b/modelscope/models/cv/action_recognition/s3dg.py @@ -1,5 +1,5 @@ # The implementation is adopted from https://github.com/TengdaHan/CoCLR, -# made pubicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR +# made publicly available under the Apache License, Version 2.0 at https://github.com/TengdaHan/CoCLR # Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import torch import torch.nn as nn @@ -47,7 +47,7 @@ class InceptionBlock3D(nn.Module): Element constructing the S3D/S3DG. See models/base/backbone.py L99-186. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. """ def __init__(self, cfg, in_planes, out_planes): @@ -139,7 +139,7 @@ class STConv3d(nn.Module): Element constructing the S3D/S3DG. See models/base/backbone.py L99-186. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. 
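The renamed spatial_atten layer in the action-detection backbone above implements a CBAM-style spatial gate: channel-wise max and mean maps are concatenated, passed through a 7x7 convolution, and used as a sigmoid mask over the feature map. A standalone sketch with made-up shapes:

import torch
import torch.nn as nn

spatial_atten = nn.Conv2d(2, 1, kernel_size=7, padding=3)

x = torch.randn(2, 256, 14, 14)
ftr = torch.cat((x.max(dim=1, keepdim=True)[0], x.mean(dim=1, keepdim=True)), dim=1)
score = spatial_atten(ftr)          # N,1,H,W attention logits
gated = x * torch.sigmoid(score)    # N,C,H,W gated features
print(gated.shape)                  # torch.Size([2, 256, 14, 14])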
""" def __init__(self, @@ -213,7 +213,7 @@ def forward(self, x): class Inception3D(nn.Module): """ Backbone architecture for I3D/S3DG. - Modifed from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. + Modified from https://github.com/TengdaHan/CoCLR/blob/main/backbone/s3dg.py. """ def __init__(self, cfg): diff --git a/modelscope/models/cv/action_recognition/tada_convnext.py b/modelscope/models/cv/action_recognition/tada_convnext.py index b1de7af8f..cf9738251 100644 --- a/modelscope/models/cv/action_recognition/tada_convnext.py +++ b/modelscope/models/cv/action_recognition/tada_convnext.py @@ -1,5 +1,5 @@ # The implementation is adopted from https://github.com/facebookresearch/ConvNeXt, -# made pubicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt +# made publicly available under the MIT License at https://github.com/facebookresearch/ConvNeXt # Copyright 2021-2022 The Alibaba FVI Team Authors. All rights reserved. import math diff --git a/modelscope/models/cv/animal_recognition/resnet.py b/modelscope/models/cv/animal_recognition/resnet.py index d7c03c299..44e44722a 100644 --- a/modelscope/models/cv/animal_recognition/resnet.py +++ b/modelscope/models/cv/animal_recognition/resnet.py @@ -1,5 +1,5 @@ # The implementation is adopted from Split-Attention Network, A New ResNet Variant, -# made pubicly available under the Apache License 2.0 License +# made publicly available under the Apache License 2.0 License # at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/resnet.py import math diff --git a/modelscope/models/cv/animal_recognition/splat.py b/modelscope/models/cv/animal_recognition/splat.py index a10d0abe1..09d65b6de 100644 --- a/modelscope/models/cv/animal_recognition/splat.py +++ b/modelscope/models/cv/animal_recognition/splat.py @@ -1,5 +1,5 @@ # The implementation is adopted from Split-Attention Network, A New ResNet Variant, -# made pubicly available under the Apache License 2.0 License +# made publicly available under the Apache License 2.0 License # at https://github.com/zhanghang1989/ResNeSt/blob/master/resnest/torch/models/splat.py """Split-Attention""" diff --git a/modelscope/models/cv/anydoor/__init__.py b/modelscope/models/cv/anydoor/__init__.py new file mode 100644 index 000000000..0eb176c42 --- /dev/null +++ b/modelscope/models/cv/anydoor/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .anydoor_model import ControlLDM + +else: + _import_structure = {'anydoor_model': ['ControlLDM']} + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/anydoor/anydoor_model.py b/modelscope/models/cv/anydoor/anydoor_model.py new file mode 100644 index 000000000..6e9316b74 --- /dev/null +++ b/modelscope/models/cv/anydoor/anydoor_model.py @@ -0,0 +1,519 @@ +import einops +import torch +import torch.nn as nn +from einops import rearrange, repeat +from torchvision.utils import make_grid + +from modelscope import Model +from modelscope.metainfo import Models +from modelscope.models.builder import MODELS +from modelscope.utils.constant import Tasks +from .cldm.ddim_hacked import DDIMSampler +from .ldm.models.diffusion.ddpm import LatentDiffusion +from .ldm.modules.attention import SpatialTransformer +from .ldm.modules.diffusionmodules.openaimodel import (AttentionBlock, + Downsample, ResBlock, + TimestepEmbedSequential, + UNetModel) +from .ldm.modules.diffusionmodules.util import (conv_nd, linear, + timestep_embedding, + zero_module) +from .ldm.util import exists + + +class ControlledUnetModel(UNetModel): + + def forward(self, + x, + timesteps=None, + context=None, + control=None, + only_mid_control=False, + **kwargs): + hs = [] + with torch.no_grad(): + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + + if control is not None: + h += control.pop() + + for i, module in enumerate(self.output_blocks): + if only_mid_control or control is None: + h = torch.cat([h, hs.pop()], dim=1) + else: + h = torch.cat([h, hs.pop() + control.pop()], dim=1) + h = module(h, emb, context) + + h = h.type(x.dtype) + return self.out(h) + + +class ControlNet(nn.Module): + + def __init__( + self, + image_size, + in_channels, + model_channels, + hint_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None, 'Need to include the dimension of your cross-attention conditioning' + + if context_dim is not None: + assert use_spatial_transformer, 'Need to use the spatial transformer for your cross-attention conditioning' + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be 
set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.dims = dims + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. ' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.') + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = torch.float16 if use_fp16 else torch.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)]) + + self.input_hint_block = TimestepEmbedSequential( + conv_nd(dims, hint_channels, 16, 3, padding=1), nn.SiLU(), + conv_nd(dims, 16, 16, 3, padding=1), nn.SiLU(), + conv_nd(dims, 16, 32, 3, padding=1, stride=2), nn.SiLU(), + conv_nd(dims, 32, 32, 3, padding=1), nn.SiLU(), + conv_nd(dims, 32, 96, 3, padding=1, stride=2), nn.SiLU(), + conv_nd(dims, 96, 96, 3, padding=1), nn.SiLU(), + conv_nd(dims, 96, 256, 3, padding=1, stride=2), nn.SiLU(), + zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))) + + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < 
num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self.zero_convs.append(self.make_zero_conv(ch)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + self.zero_convs.append(self.make_zero_conv(ch)) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self.middle_block_out = self.make_zero_conv(ch) + self._feature_size += ch + + def make_zero_conv(self, channels): + return TimestepEmbedSequential( + zero_module(conv_nd(self.dims, channels, channels, 1, padding=0))) + + def forward(self, x, hint, timesteps, context, **kwargs): + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) # 1,1280 + + # 1,320,64,64 + guided_hint = self.input_hint_block(hint, emb, context) + outs = [] + + h = x.type(self.dtype) + for module, zero_conv in zip(self.input_blocks, self.zero_convs): + if guided_hint is not None: + # skip the first layer + h = guided_hint + guided_hint = None + else: + h_new = module(h, emb, context) + h = h_new + outs.append(zero_conv(h, emb, context)) + + h_new = self.middle_block(h, emb, context) + outs.append(self.middle_block_out(h_new, emb, context)) + return outs + + +@MODELS.register_module( + Tasks.image_to_image_generation, module_name=Models.anydoor) +class ControlLDM(LatentDiffusion, Model): + ''' + This work presents AnyDoor, a diffusion-based image generator + with the power to teleport target objects to new scenes + at user-specified locations in a harmonious way. 
+ + Instead of tuning parameters for each object, our model + is trained only once and effortlessly generalizes + to diverse object-scene combinations at the inference stage. + + arxiv: https://arxiv.org/abs/2307.09481 + ''' + + def __init__(self, control_stage_config, control_key, only_mid_control, + *args, **kwargs): + super().__init__(*args, **kwargs) + self.control_model = ControlNet(**control_stage_config) + self.control_key = control_key + self.only_mid_control = only_mid_control + self.control_scales = [1.0] * 13 + + @torch.no_grad() + def get_input(self, batch, k, bs=None, *args, **kwargs): + x, c = super().get_input(batch, self.first_stage_key, *args, **kwargs) + control = batch[self.control_key] + if bs is not None: + control = control[:bs] + control = control.to(self.device) + control = einops.rearrange(control, 'b h w c -> b c h w') + control = control.to(memory_format=torch.contiguous_format).float() + self.time_steps = batch['time_steps'] + return x, dict(c_crossattn=[c], c_concat=[control]) + + def apply_model(self, x_noisy, t, cond, *args, **kwargs): + assert isinstance(cond, dict) + diffusion_model = self.model.diffusion_model + + cond_txt = torch.cat(cond['c_crossattn'], 1) + + if cond['c_concat'] is None: + eps = diffusion_model( + x=x_noisy, + timesteps=t, + context=cond_txt, + control=None, + only_mid_control=self.only_mid_control) + else: + control = self.control_model( + x=x_noisy, + hint=torch.cat(cond['c_concat'], 1), + timesteps=t, + context=cond_txt) + control = [ + c * scale for c, scale in zip(control, self.control_scales) + ] + eps = diffusion_model( + x=x_noisy, + timesteps=t, + context=cond_txt, + control=control, + only_mid_control=self.only_mid_control) + return eps + + @torch.no_grad() + def get_unconditional_conditioning(self, N): + uncond = self.get_learned_conditioning([torch.zeros( + (1, 3, 224, 224))] * N) + return uncond + + @torch.no_grad() + def log_images(self, + batch, + N=4, + n_row=2, + sample=False, + ddim_steps=50, + ddim_eta=0.0, + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=False, + unconditional_guidance_scale=9.0, + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + use_ddim = ddim_steps is not None + + log = dict() + z, c = self.get_input(batch, self.first_stage_key, bs=N) + c_cat, c = c['c_concat'][0][:N], c['c_crossattn'][0][:N] + N = min(z.shape[0], N) + n_row = min(z.shape[0], n_row) + log['reconstruction'] = self.decode_first_stage(z) + + # ==== visualize the shape mask or the high-frequency map ==== + guide_mask = (c_cat[:, -1, :, :].unsqueeze(1) + 1) * 0.5 + guide_mask = torch.cat([guide_mask, guide_mask, guide_mask], 1) + HF_map = c_cat[:, :3, :, :] # * 2.0 - 1.0 + + log['control'] = HF_map + + cond_image = batch[self.cond_stage_key].cpu().numpy().copy() + log['conditioning'] = torch.permute( + torch.tensor(cond_image), (0, 3, 1, 2)) * 2.0 - 1.0 + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') 
+ diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + samples, z_denoise_row = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning(N) + uc_cat = c_cat # torch.zeros_like(c_cat) + uc_full = {'c_concat': [uc_cat], 'c_crossattn': [uc_cross]} + samples_cfg, _ = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg # * 2.0 - 1.0 + return log + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + ddim_sampler = DDIMSampler(self) + b, c, h, w = cond['c_concat'][0].shape + shape = (self.channels, h // 8, w // 8) + samples, intermediates = ddim_sampler.sample( + ddim_steps, batch_size, shape, cond, verbose=False, **kwargs) + return samples, intermediates + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.control_model.parameters()) + if not self.sd_locked: + params += list( + self.model.diffusion_model.output_blocks.parameters()) + params += list(self.model.diffusion_model.out.parameters()) + params += list(self.cond_stage_model.projector.parameters()) + opt = torch.optim.AdamW(params, lr=lr) + return opt + + def low_vram_shift(self, is_diffusing): + if is_diffusing: + self.model = self.model.cuda() + self.control_model = self.control_model.cuda() + self.first_stage_model = self.first_stage_model.cpu() + self.cond_stage_model = self.cond_stage_model.cpu() + else: + self.model = self.model.cpu() + self.control_model = self.control_model.cpu() + self.first_stage_model = self.first_stage_model.cuda() + self.cond_stage_model = self.cond_stage_model.cuda() diff --git a/modelscope/models/cv/anydoor/cldm/__init__.py b/modelscope/models/cv/anydoor/cldm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/cldm/ddim_hacked.py b/modelscope/models/cv/anydoor/cldm/ddim_hacked.py new file mode 100644 index 000000000..e6adf5716 --- /dev/null +++ b/modelscope/models/cv/anydoor/cldm/ddim_hacked.py @@ -0,0 +1,428 @@ +"""SAMPLING ONLY.""" + +import numpy as np +import torch +from tqdm import tqdm + +from ..ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_ddim_sampling_parameters, + make_ddim_timesteps, + noise_like) + + +class DDIMSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + 
verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, # this has to come in the same format as the conditioning + dynamic_threshold=None, + ucg_schedule=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + 
ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None): + device = self.model.betas.device + b = shape[0] + # x_T 1,4,64,64 + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = reversed(range( + 0, timesteps)) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
- mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + model_output = self.model.apply_model(x, t, c) + else: + model_t = self.model.apply_model(x, t, c) + model_uncond = self.model.apply_model(x, t, + unconditional_conditioning) + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond) + + if self.model.parameterization == 'v': + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == 'eps', 'not implemented' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + if self.model.parameterization != 'v': + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode(self, + x0, + c, + t_enc, + use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None): + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + num_reference_steps = timesteps.shape[0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc='Encoding Image'): + t = torch.full((x0.shape[0], ), + timesteps[i], + device=self.model.device, + dtype=torch.long) + if unconditional_guidance_scale == 1.: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model( + torch.cat((x_next, x_next)), torch.cat((t, t)), + torch.cat((unconditional_conditioning, c))), 2) + noise_pred = e_t_uncond + unconditional_guidance_scale * ( + noise_pred - e_t_uncond) + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + tmp = (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt() + weighted_noise_pred = alphas_next[i].sqrt() * tmp * noise_pred + x_next = xt_weighted + weighted_noise_pred + if return_intermediates and i % (num_steps // return_intermediates + ) == 0 and i < num_steps - 1: + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} + if return_intermediates: + out.update({'intermediates': intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) + * noise) + + @torch.no_grad() + def decode(self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None): + + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = 
torch.full((x_latent.shape[0], ), + step, + device=x_latent.device, + dtype=torch.long) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + if callback: + callback(i) + return x_dec diff --git a/modelscope/models/cv/anydoor/datasets/__init__.py b/modelscope/models/cv/anydoor/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/datasets/data_utils.py b/modelscope/models/cv/anydoor/datasets/data_utils.py new file mode 100644 index 000000000..82d41b1cf --- /dev/null +++ b/modelscope/models/cv/anydoor/datasets/data_utils.py @@ -0,0 +1,364 @@ +import cv2 +import numpy as np +import torch + + +def mask_score(mask): + '''Scoring the mask according to connectivity.''' + mask = mask.astype(np.uint8) + if mask.sum() < 10: + return 0 + contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, + cv2.CHAIN_APPROX_NONE) + cnt_area = [cv2.contourArea(cnt) for cnt in contours] + conc_score = np.max(cnt_area) / sum(cnt_area) + return conc_score + + +def sobel(img, mask, thresh=50): + '''Calculating the high-frequency map.''' + H, W = img.shape[0], img.shape[1] + img = cv2.resize(img, (256, 256)) + mask = (cv2.resize(mask, (256, 256)) > 0.5).astype(np.uint8) + kernel = np.ones((5, 5), np.uint8) + mask = cv2.erode(mask, kernel, iterations=2) + + Ksize = 3 + sobelx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=Ksize) + sobely = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=Ksize) + sobel_X = cv2.convertScaleAbs(sobelx) + sobel_Y = cv2.convertScaleAbs(sobely) + scharr = cv2.addWeighted(sobel_X, 0.5, sobel_Y, 0.5, 0) + scharr = np.max(scharr, -1) * mask + + scharr[scharr < thresh] = 0.0 + scharr = np.stack([scharr, scharr, scharr], -1) + scharr = (scharr.astype(np.float32) / 255 * img.astype(np.float32)).astype( + np.uint8) + scharr = cv2.resize(scharr, (W, H)) + return scharr + + +def resize_and_pad(image, box): + '''Fitting an image to the box region while keeping the aspect ratio.''' + y1, y2, x1, x2 = box + H, W = y2 - y1, x2 - x1 + h, w = image.shape[0], image.shape[1] + r_box = W / H + r_image = w / h + if r_box >= r_image: + h_target = H + w_target = int(w * H / h) + image = cv2.resize(image, (w_target, h_target)) + + w1 = (W - w_target) // 2 + w2 = W - w_target - w1 + pad_param = ((0, 0), (w1, w2), (0, 0)) + image = np.pad(image, pad_param, 'constant', constant_values=255) + else: + w_target = W + h_target = int(h * W / w) + image = cv2.resize(image, (w_target, h_target)) + + h1 = (H - h_target) // 2 + h2 = H - h_target - h1 + pad_param = ((h1, h2), (0, 0), (0, 0)) + image = np.pad(image, pad_param, 'constant', constant_values=255) + return image + + +def expand_image_mask(image, mask, ratio=1.4): + h, w = image.shape[0], image.shape[1] + H, W = int(h * ratio), int(w * ratio) + h1 = int((H - h) // 2) + h2 = H - h - h1 + w1 = int((W - w) // 2) + w2 = W - w - w1 + + pad_param_image = ((h1, h2), (w1, w2), (0, 0)) + pad_param_mask = ((h1, h2), (w1, w2)) + image = np.pad(image, pad_param_image, 'constant', constant_values=255) + mask = np.pad(mask, pad_param_mask, 'constant', constant_values=0) + return image, mask + + +def resize_box(yyxx, H, W, h, w): + y1, y2, x1, x2 = yyxx + y1, y2 = int(y1 / H * h), int(y2 / H * h) + x1, x2 = int(x1 / W * w), int(x2 / W * w) + y1, y2 = min(y1, h), min(y2, h) + x1, x2 = min(x1, w), min(x2, w) + return (y1, y2, x1, x2) + + +def 
get_bbox_from_mask(mask): + h, w = mask.shape[0], mask.shape[1] + + if mask.sum() < 10: + return 0, h, 0, w + rows = np.any(mask, axis=1) + cols = np.any(mask, axis=0) + y1, y2 = np.where(rows)[0][[0, -1]] + x1, x2 = np.where(cols)[0][[0, -1]] + return (y1, y2, x1, x2) + + +def expand_bbox(mask, yyxx, ratio=[1.2, 2.0], min_crop=0): + y1, y2, x1, x2 = yyxx + ratio = np.random.randint(ratio[0] * 10, ratio[1] * 10) / 10 + H, W = mask.shape[0], mask.shape[1] + xc, yc = 0.5 * (x1 + x2), 0.5 * (y1 + y2) + h = ratio * (y2 - y1 + 1) + w = ratio * (x2 - x1 + 1) + h = max(h, min_crop) + w = max(w, min_crop) + + x1 = int(xc - w * 0.5) + x2 = int(xc + w * 0.5) + y1 = int(yc - h * 0.5) + y2 = int(yc + h * 0.5) + + x1 = max(0, x1) + x2 = min(W, x2) + y1 = max(0, y1) + y2 = min(H, y2) + return (y1, y2, x1, x2) + + +def box2squre(image, box): + H, W = image.shape[0], image.shape[1] + y1, y2, x1, x2 = box + cx = (x1 + x2) // 2 + cy = (y1 + y2) // 2 + h, w = y2 - y1, x2 - x1 + + if h >= w: + x1 = cx - h // 2 + x2 = cx + h // 2 + else: + y1 = cy - w // 2 + y2 = cy + w // 2 + x1 = max(0, x1) + x2 = min(W, x2) + y1 = max(0, y1) + y2 = min(H, y2) + return (y1, y2, x1, x2) + + +def pad_to_square(image, pad_value=255, random=False): + H, W = image.shape[0], image.shape[1] + if H == W: + return image + + padd = abs(H - W) + if random: + padd_1 = int(np.random.randint(0, padd)) + else: + padd_1 = int(padd / 2) + padd_2 = padd - padd_1 + + if H > W: + pad_param = ((0, 0), (padd_1, padd_2), (0, 0)) + else: + pad_param = ((padd_1, padd_2), (0, 0), (0, 0)) + + image = np.pad(image, pad_param, 'constant', constant_values=pad_value) + return image + + +def box_in_box(small_box, big_box): + y1, y2, x1, x2 = small_box + y1_b, _, x1_b, _ = big_box + y1, y2, x1, x2 = y1 - y1_b, y2 - y1_b, x1 - x1_b, x2 - x1_b + return (y1, y2, x1, x2) + + +def shuffle_image(image, N): + height, width = image.shape[:2] + + block_height = height // N + block_width = width // N + blocks = [] + + for i in range(N): + for j in range(N): + block = image[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) * block_width] + blocks.append(block) + + np.random.shuffle(blocks) + shuffled_image = np.zeros((height, width, 3), dtype=np.uint8) + + for i in range(N): + for j in range(N): + shuffled_image[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) + * block_width] = blocks[i * N + j] + return shuffled_image + + +def get_mosaic_mask(image, fg_mask, N=16, ratio=0.5): + ids = [i for i in range(N * N)] + masked_number = int(N * N * ratio) + masked_id = np.random.choice(ids, masked_number, replace=False) + + height, width = image.shape[:2] + mask = np.ones((height, width)) + + block_height = height // N + block_width = width // N + + b_id = 0 + for i in range(N): + for j in range(N): + if b_id in masked_id: + mask[i * block_height:(i + 1) * block_height, + j * block_width:(j + 1) + * block_width] = mask[i * block_height:(i + 1) + * block_height, j * block_width: + (j + 1) * block_width] * 0 + b_id += 1 + mask = mask * fg_mask + mask3 = np.stack([mask, mask, mask], -1).copy().astype(np.uint8) + noise = q_x(image) + noise_mask = image * mask3 + noise * (1 - mask3) + return noise_mask + + +def extract_canny_noise(image, mask, dilate=True): + h, w = image.shape[0], image.shape[1] + mask = cv2.resize(mask.astype(np.uint8), (w, h)) > 0.5 + kernel = np.ones((8, 8), dtype=np.uint8) + mask = cv2.erode(mask.astype(np.uint8), kernel, 10) + + canny = cv2.Canny(image, 50, 100) * mask + kernel = np.ones((8, 8), dtype=np.uint8) + mask = 
(cv2.dilate(canny, kernel, 5) > 128).astype(np.uint8) + mask = np.stack([mask, mask, mask], -1) + + pure_noise = q_x(image, t=1) * 0 + 255 + canny_noise = mask * image + (1 - mask) * pure_noise + return canny_noise + + +def get_random_structure(size): + choice = np.random.randint(1, 5) + + if choice == 1: + return cv2.getStructuringElement(cv2.MORPH_RECT, (size, size)) + elif choice == 2: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size)) + elif choice == 3: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size, size // 2)) + elif choice == 4: + return cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (size // 2, size)) + + +def random_dilate(seg, min=3, max=10): + size = np.random.randint(min, max) + kernel = get_random_structure(size) + seg = cv2.dilate(seg, kernel, iterations=1) + return seg + + +def random_erode(seg, min=3, max=10): + size = np.random.randint(min, max) + kernel = get_random_structure(size) + seg = cv2.erode(seg, kernel, iterations=1) + return seg + + +def compute_iou(seg, gt): + intersection = seg * gt + union = seg + gt + return (np.count_nonzero(intersection) + 1e-6) / ( + np.count_nonzero(union) + 1e-6) + + +def select_max_region(mask): + nums, labels, stats, centroids = cv2.connectedComponentsWithStats( + mask, connectivity=8) + background = 0 + for row in range(stats.shape[0]): + if stats[row, :][0] == 0 and stats[row, :][1] == 0: + background = row + stats_no_bg = np.delete(stats, background, axis=0) + max_idx = stats_no_bg[:, 4].argmax() + max_region = np.where(labels == max_idx + 1, 1, 0) + + return max_region.astype(np.uint8) + + +def perturb_mask(gt, min_iou=0.3, max_iou=0.99): + iou_target = np.random.uniform(min_iou, max_iou) + h, w = gt.shape + gt = gt.astype(np.uint8) + seg = gt.copy() + + # Rare case + if h <= 2 or w <= 2: + print('GT too small, returning original') + return seg + + # Do a bunch of random operations + for _ in range(250): + for _ in range(4): + lx, ly = np.random.randint(w), np.random.randint(h) + lw, lh = np.random.randint(lx + 1, w + 1), np.random.randint( + ly + 1, h + 1) + + # Randomly set one pixel to 1/0. 
With the following dilate/erode, we can create holes/external regions + if np.random.rand() < 0.1: + cx = int((lx + lw) / 2) + cy = int((ly + lh) / 2) + seg[cy, cx] = np.random.randint(2) * 255 + + # Dilate/erode + if np.random.rand() < 0.5: + seg[ly:lh, lx:lw] = random_dilate(seg[ly:lh, lx:lw]) + else: + seg[ly:lh, lx:lw] = random_erode(seg[ly:lh, lx:lw]) + + seg = np.logical_or(seg, gt).astype(np.uint8) + # seg = select_max_region(seg) + + if compute_iou(seg, gt) < iou_target: + break + seg = select_max_region(seg.astype(np.uint8)) + return seg.astype(np.uint8) + + +def q_x(x_0, t=65): + '''Adding noise for and given image.''' + x_0 = torch.from_numpy(x_0).float() / 127.5 - 1 + num_steps = 100 + + betas = torch.linspace(-6, 6, num_steps) + betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 + + alphas = 1 - betas + alphas_prod = torch.cumprod(alphas, 0) + + alphas_bar_sqrt = torch.sqrt(alphas_prod) + one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) + + noise = torch.randn_like(x_0) + alphas_t = alphas_bar_sqrt[t] + alphas_1_m_t = one_minus_alphas_bar_sqrt[t] + return (alphas_t * x_0 + alphas_1_m_t * noise).numpy() * 127.5 + 127.5 + + +def extract_target_boundary(img, target_mask): + Ksize = 3 + sobelx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=Ksize) + sobely = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=Ksize) + + # sobel-x + sobel_X = cv2.convertScaleAbs(sobelx) + # sobel-y + sobel_Y = cv2.convertScaleAbs(sobely) + # sobel-xy + scharr = cv2.addWeighted(sobel_X, 0.5, sobel_Y, 0.5, 0) + scharr = np.max(scharr, -1).astype(np.float32) / 255 + scharr = scharr * target_mask.astype(np.float32) + return scharr diff --git a/modelscope/models/cv/anydoor/dinov2/__init__.py b/modelscope/models/cv/anydoor/dinov2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py new file mode 100644 index 000000000..daadf5eb3 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .attention import MemEffAttention +from .block import NestedTensorBlock +from .dino_head import DINOHead +from .mlp import Mlp +from .patch_embed import PatchEmbed +from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py new file mode 100644 index 000000000..2efee7368 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/attention.py @@ -0,0 +1,86 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
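The data_utils.py helpers added above are plain NumPy/OpenCV functions. A minimal sketch of chaining a few of them to build the masked high-frequency map for a reference object; the file names, the expand ratio, and the exact preprocessing order are illustrative assumptions, not taken from this diff, and the helpers are assumed to be imported from the module above.

import cv2
import numpy as np

# Hypothetical inputs: an RGB reference image and its binary object mask.
ref_image = cv2.imread('ref_image.png')                      # H x W x 3, uint8
ref_mask = (cv2.imread('ref_mask.png', 0) > 128).astype(np.uint8)

# get_bbox_from_mask, expand_image_mask, sobel and pad_to_square are defined above.
y1, y2, x1, x2 = get_bbox_from_mask(ref_mask)                # tight box around the object
ref_crop, mask_crop = ref_image[y1:y2, x1:x2], ref_mask[y1:y2, x1:x2]
ref_crop, mask_crop = expand_image_mask(ref_crop, mask_crop, ratio=1.3)

hf_map = sobel(ref_crop, mask_crop)                          # masked high-frequency detail map
hf_map = pad_to_square(hf_map, pad_value=0)                  # square canvas for later resizing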
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging + +from torch import Tensor, nn + +logger = logging.getLogger('dinov2') + +try: + from xformers.ops import memory_efficient_attention, unbind, fmha + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning('xFormers not available') + XFORMERS_AVAILABLE = False + + +class Attention(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = False, + proj_bias: bool = True, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + ) -> None: + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim, bias=proj_bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x: Tensor) -> Tensor: + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + attn = q @ k.transpose(-2, -1) + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class MemEffAttention(Attention): + + def forward(self, x: Tensor, attn_bias=None) -> Tensor: + if not XFORMERS_AVAILABLE: + assert attn_bias is None, 'xFormers is required for nested tensors usage' + return super().forward(x) + + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads) + + q, k, v = unbind(qkv, 2) + + if attn_bias is not None: + self_att_op = fmha.MemoryEfficientAttentionFlashAttentionOp + else: + self_att_op = None + x = memory_efficient_attention( + q, k, v, attn_bias=attn_bias, op=self_att_op) + x = x.reshape([B, N, C]) + + x = self.proj(x) + x = self.proj_drop(x) + return x diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py new file mode 100644 index 000000000..f9f1f9caf --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/block.py @@ -0,0 +1,286 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
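A quick shape check for the Attention module defined in attention.py above; the dimensions are illustrative (any dim divisible by num_heads works), and xFormers is only needed for the MemEffAttention nested-tensor path, not for this plain forward.

import torch

attn = Attention(dim=768, num_heads=12, qkv_bias=True)
tokens = torch.randn(2, 257, 768)      # (batch, 1 cls token + 16x16 patches, embed_dim)
out = attn(tokens)
assert out.shape == tokens.shape       # self-attention preserves the token shape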
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +import logging +from typing import Any, Callable, Dict, List, Tuple + +import torch +from torch import Tensor, nn + +from .attention import Attention, MemEffAttention +from .drop_path import DropPath +from .layer_scale import LayerScale +from .mlp import Mlp + +logger = logging.getLogger('dinov2') + +try: + from xformers.ops import fmha + from xformers.ops import scaled_index_add, index_select_cat + + XFORMERS_AVAILABLE = True +except ImportError: + logger.warning('xFormers not available') + XFORMERS_AVAILABLE = False + + +class Block(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = False, + proj_bias: bool = True, + ffn_bias: bool = True, + drop: float = 0.0, + attn_drop: float = 0.0, + init_values=None, + drop_path: float = 0.0, + act_layer: Callable[..., nn.Module] = nn.GELU, + norm_layer: Callable[..., nn.Module] = nn.LayerNorm, + attn_class: Callable[..., nn.Module] = Attention, + ffn_layer: Callable[..., nn.Module] = Mlp, + ) -> None: + super().__init__() + # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}") + self.norm1 = norm_layer(dim) + self.attn = attn_class( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.ls1 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path1 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = ffn_layer( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + bias=ffn_bias, + ) + self.ls2 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath( + drop_path) if drop_path > 0.0 else nn.Identity() + + self.sample_drop_ratio = drop_path + + def forward(self, x: Tensor) -> Tensor: + + def attn_residual_func(x: Tensor) -> Tensor: + return self.ls1(self.attn(self.norm1(x))) + + def ffn_residual_func(x: Tensor) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + if self.training and self.sample_drop_ratio > 0.1: + # the overhead is compensated only for a drop path rate larger than 0.1 + x = drop_add_residual_stochastic_depth( + x, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + x = drop_add_residual_stochastic_depth( + x, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + ) + elif self.training and self.sample_drop_ratio > 0.0: + x = x + self.drop_path1(attn_residual_func(x)) + x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2 + else: + x = x + attn_residual_func(x) + x = x + ffn_residual_func(x) + return x + + +def drop_add_residual_stochastic_depth( + x: Tensor, + residual_func: Callable[[Tensor], Tensor], + sample_drop_ratio: float = 0.0, +) -> Tensor: + # 1) extract subset using permutation + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + x_subset = x[brange] + + # 2) apply residual_func to get residual + residual = residual_func(x_subset) + + x_flat = x.flatten(1) + residual = residual.flatten(1) + + residual_scale_factor = b / sample_subset_size + + # 3) add the residual + x_plus_residual = 
torch.index_add( + x_flat, + 0, + brange, + residual.to(dtype=x.dtype), + alpha=residual_scale_factor) + return x_plus_residual.view_as(x) + + +def get_branges_scales(x, sample_drop_ratio=0.0): + b, n, d = x.shape + sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1) + brange = (torch.randperm(b, device=x.device))[:sample_subset_size] + residual_scale_factor = b / sample_subset_size + return brange, residual_scale_factor + + +def add_residual(x, + brange, + residual, + residual_scale_factor, + scaling_vector=None): + if scaling_vector is None: + x_flat = x.flatten(1) + residual = residual.flatten(1) + x_plus_residual = torch.index_add( + x_flat, + 0, + brange, + residual.to(dtype=x.dtype), + alpha=residual_scale_factor) + else: + x_plus_residual = scaled_index_add( + x, + brange, + residual.to(dtype=x.dtype), + scaling=scaling_vector, + alpha=residual_scale_factor) + return x_plus_residual + + +attn_bias_cache: Dict[Tuple, Any] = {} + + +def get_attn_bias_and_cat(x_list, branges=None): + """ + this will perform the index select, cat the tensors, and provide the attn_bias from cache + """ + batch_sizes = [b.shape[0] for b in branges + ] if branges is not None else [x.shape[0] for x in x_list] + all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list)) + if all_shapes not in attn_bias_cache.keys(): + seqlens = [] + for b, x in zip(batch_sizes, x_list): + for _ in range(b): + seqlens.append(x.shape[1]) + attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens) + attn_bias._batch_sizes = batch_sizes + attn_bias_cache[all_shapes] = attn_bias + + if branges is not None: + cat_tensors = index_select_cat([x.flatten(1) for x in x_list], + branges).view(1, -1, + x_list[0].shape[-1]) + else: + tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list) + cat_tensors = torch.cat(tensors_bs1, dim=1) + + return attn_bias_cache[all_shapes], cat_tensors + + +def drop_add_residual_stochastic_depth_list( + x_list: List[Tensor], + residual_func: Callable[[Tensor, Any], Tensor], + sample_drop_ratio: float = 0.0, + scaling_vector=None, +) -> Tensor: + # 1) generate random set of indices for dropping samples in the batch + branges_scales = [ + get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) + for x in x_list + ] + branges = [s[0] for s in branges_scales] + residual_scale_factors = [s[1] for s in branges_scales] + + # 2) get attention bias and index+concat the tensors + attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges) + + # 3) apply residual_func to get residual, and split the result + residual_list = attn_bias.split(residual_func( + x_cat, attn_bias=attn_bias)) # type: ignore + + outputs = [] + for x, brange, residual, residual_scale_factor in zip( + x_list, branges, residual_list, residual_scale_factors): + outputs.append( + add_residual(x, brange, residual, residual_scale_factor, + scaling_vector).view_as(x)) + return outputs + + +class NestedTensorBlock(Block): + + def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]: + """ + x_list contains a list of tensors to nest together and run + """ + assert isinstance(self.attn, MemEffAttention) + + if self.training and self.sample_drop_ratio > 0.0: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.attn(self.norm1(x), attn_bias=attn_bias) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.mlp(self.norm2(x)) + + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=attn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, 
+ scaling_vector=self.ls1.gamma if isinstance( + self.ls1, LayerScale) else None, + ) + x_list = drop_add_residual_stochastic_depth_list( + x_list, + residual_func=ffn_residual_func, + sample_drop_ratio=self.sample_drop_ratio, + scaling_vector=self.ls2.gamma if isinstance( + self.ls1, LayerScale) else None, + ) + return x_list + else: + + def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias)) + + def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor: + return self.ls2(self.mlp(self.norm2(x))) + + attn_bias, x = get_attn_bias_and_cat(x_list) + x = x + attn_residual_func(x, attn_bias=attn_bias) + x = x + ffn_residual_func(x) + return attn_bias.split(x) + + def forward(self, x_or_x_list): + if isinstance(x_or_x_list, Tensor): + return super().forward(x_or_x_list) + elif isinstance(x_or_x_list, list): + assert XFORMERS_AVAILABLE, 'Please install xFormers for nested tensors usage' + return self.forward_nested(x_or_x_list) + else: + raise AssertionError diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py new file mode 100644 index 000000000..72a21386f --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/dino_head.py @@ -0,0 +1,72 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torch.nn.init import trunc_normal_ +from torch.nn.utils import weight_norm + + +class DINOHead(nn.Module): + + def __init__( + self, + in_dim, + out_dim, + use_bn=False, + nlayers=3, + hidden_dim=2048, + bottleneck_dim=256, + mlp_bias=True, + ): + super().__init__() + nlayers = max(nlayers, 1) + self.mlp = _build_mlp( + nlayers, + in_dim, + bottleneck_dim, + hidden_dim=hidden_dim, + use_bn=use_bn, + bias=mlp_bias) + self.apply(self._init_weights) + self.last_layer = weight_norm( + nn.Linear(bottleneck_dim, out_dim, bias=False)) + self.last_layer.weight_g.data.fill_(1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + def forward(self, x): + x = self.mlp(x) + eps = 1e-6 if x.dtype == torch.float16 else 1e-12 + x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) + x = self.last_layer(x) + return x + + +def _build_mlp(nlayers, + in_dim, + bottleneck_dim, + hidden_dim=None, + use_bn=False, + bias=True): + if nlayers == 1: + return nn.Linear(in_dim, bottleneck_dim, bias=bias) + else: + layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + for _ in range(nlayers - 2): + layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) + if use_bn: + layers.append(nn.BatchNorm1d(hidden_dim)) + layers.append(nn.GELU()) + layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) + return nn.Sequential(*layers) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py new file mode 100644 index 000000000..d28930e1e --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/drop_path.py @@ -0,0 +1,35 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py + +from torch import nn + + +def drop_path(x, drop_prob: float = 0.0, training: bool = False): + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0: + random_tensor.div_(keep_prob) + output = x * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py new file mode 100644 index 000000000..c84e741a1 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/layer_scale.py @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Union + +import torch +from torch import Tensor, nn + + +class LayerScale(nn.Module): + + def __init__( + self, + dim: int, + init_values: Union[float, Tensor] = 1e-5, + inplace: bool = False, + ) -> None: + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x: Tensor) -> Tensor: + return x.mul_(self.gamma) if self.inplace else x * self.gamma diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py new file mode 100644 index 000000000..68a286b73 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/mlp.py @@ -0,0 +1,41 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
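A small illustration of the two residual-branch utilities above, with arbitrary values: DropPath (stochastic depth) only drops samples in training mode, and LayerScale applies a learnable per-channel gain initialized to init_values.

import torch

x = torch.randn(8, 16, 768)

dp = DropPath(drop_prob=0.2)
dp.eval()
assert torch.equal(dp(x), x)            # stochastic depth is a no-op at inference time

ls = LayerScale(dim=768, init_values=1e-5)
assert torch.allclose(ls(x), x * 1e-5)  # per-channel gain, initialized to 1e-5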
+ +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py + +from typing import Callable, Optional + +from torch import Tensor, nn + + +class Mlp(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = nn.GELU, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) + self.drop = nn.Dropout(drop) + + def forward(self, x: Tensor) -> Tensor: + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py new file mode 100644 index 000000000..ec5aa7521 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/patch_embed.py @@ -0,0 +1,91 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# References: +# https://github.com/facebookresearch/dino/blob/master/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py + +from typing import Callable, Optional, Tuple, Union + +import torch.nn as nn +from torch import Tensor + + +def make_2tuple(x): + if isinstance(x, tuple): + assert len(x) == 2 + return x + + assert isinstance(x, int) + return (x, x) + + +class PatchEmbed(nn.Module): + """ + 2D image to patch embedding: (B,C,H,W) -> (B,N,D) + + Args: + img_size: Image size. + patch_size: Patch token size. + in_chans: Number of input image channels. + embed_dim: Number of linear projection output channels. + norm_layer: Normalization layer. 
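+        flatten_embedding: Whether to flatten the patch grid into (B, N, D) tokens (default) or keep the (B, H', W', D) layout.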
+ """ + + def __init__( + self, + img_size: Union[int, Tuple[int, int]] = 224, + patch_size: Union[int, Tuple[int, int]] = 16, + in_chans: int = 3, + embed_dim: int = 768, + norm_layer: Optional[Callable] = None, + flatten_embedding: bool = True, + ) -> None: + super().__init__() + + image_HW = make_2tuple(img_size) + patch_HW = make_2tuple(patch_size) + patch_grid_size = ( + image_HW[0] // patch_HW[0], + image_HW[1] // patch_HW[1], + ) + + self.img_size = image_HW + self.patch_size = patch_HW + self.patches_resolution = patch_grid_size + self.num_patches = patch_grid_size[0] * patch_grid_size[1] + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.flatten_embedding = flatten_embedding + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x: Tensor) -> Tensor: + _, _, H, W = x.shape + patch_H, patch_W = self.patch_size + + assert H % patch_H == 0, f'Input image height {H} is not a multiple of patch height {patch_H}' + assert W % patch_W == 0, f'Input image width {W} is not a multiple of patch width: {patch_W}' + + x = self.proj(x) # B C H W + H, W = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) # B HW C + x = self.norm(x) + if not self.flatten_embedding: + x = x.reshape(-1, H, W, self.embed_dim) # B H W C + return x + + def flops(self) -> float: + Ho, Wo = self.patches_resolution + flops = Ho * Wo * self.embed_dim * self.in_chans * ( + self.patch_size[0] * self.patch_size[1]) + if self.norm is not None: + flops += Ho * Wo * self.embed_dim + return flops diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py new file mode 100644 index 000000000..b6c593f7a --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/layers/swiglu_ffn.py @@ -0,0 +1,65 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
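A shape walk-through for the PatchEmbed module above, with illustrative values: a 224x224 input split into 16x16 patches yields a 14*14 = 196 token sequence.

import torch

pe = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
imgs = torch.randn(2, 3, 224, 224)
tokens = pe(imgs)
assert tokens.shape == (2, 196, 768)    # (B, N, D) with flatten_embedding=True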
+ +from typing import Callable, Optional + +import torch.nn.functional as F +from torch import Tensor, nn + + +class SwiGLUFFN(nn.Module): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias) + self.w3 = nn.Linear(hidden_features, out_features, bias=bias) + + def forward(self, x: Tensor) -> Tensor: + x12 = self.w12(x) + x1, x2 = x12.chunk(2, dim=-1) + hidden = F.silu(x1) * x2 + return self.w3(hidden) + + +try: + from xformers.ops import SwiGLU + + XFORMERS_AVAILABLE = True +except ImportError: + SwiGLU = SwiGLUFFN + XFORMERS_AVAILABLE = False + + +class SwiGLUFFNFused(SwiGLU): + + def __init__( + self, + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + act_layer: Callable[..., nn.Module] = None, + drop: float = 0.0, + bias: bool = True, + ) -> None: + out_features = out_features or in_features + hidden_features = hidden_features or in_features + hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8 + super().__init__( + in_features=in_features, + hidden_features=hidden_features, + out_features=out_features, + bias=bias, + ) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py b/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py new file mode 100644 index 000000000..4d8b4118a --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/models/__init__.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +from . import vision_transformer as vits + +logger = logging.getLogger('dinov2') + + +def build_model(args, only_teacher=False, img_size=224): + args.arch = args.arch.removesuffix('_memeff') + if 'vit' in args.arch: + vit_kwargs = dict( + img_size=img_size, + patch_size=args.patch_size, + init_values=args.layerscale, + ffn_layer=args.ffn_layer, + block_chunks=args.block_chunks, + qkv_bias=args.qkv_bias, + proj_bias=args.proj_bias, + ffn_bias=args.ffn_bias, + ) + teacher = vits.__dict__[args.arch](**vit_kwargs) + if only_teacher: + return teacher, teacher.embed_dim + student = vits.__dict__[args.arch]( + **vit_kwargs, + drop_path_rate=args.drop_path_rate, + drop_path_uniform=args.drop_path_uniform, + ) + embed_dim = student.embed_dim + return student, teacher, embed_dim + + +def build_model_from_cfg(cfg, only_teacher=False): + return build_model( + cfg.student, + only_teacher=only_teacher, + img_size=cfg.crops.global_crops_size) diff --git a/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py b/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py new file mode 100644 index 000000000..2c9c6ec96 --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/dinov2/models/vision_transformer.py @@ -0,0 +1,390 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
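+
+# Rough usage sketch (illustrative only; the values mirror the hubconf defaults):
+#   model = vit_large(patch_size=14, img_size=518, init_values=1.0, block_chunks=0)
+#   out = model.forward_features(torch.randn(1, 3, 518, 518))
+#   patch_tokens = out['x_norm_patchtokens']  # shape (1, (518 // 14) ** 2, 1024)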
+ +# References: +# https://github.com/facebookresearch/dino/blob/main/vision_transformer.py +# https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py + +import logging +import math +from functools import partial +from typing import Callable, Sequence, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint +from torch.nn.init import trunc_normal_ + +from ..layers import MemEffAttention, Mlp +from ..layers import NestedTensorBlock as Block +from ..layers import PatchEmbed, SwiGLUFFNFused + +logger = logging.getLogger('dinov2') + + +def named_apply(fn: Callable, + module: nn.Module, + name='', + depth_first=True, + include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = '.'.join((name, child_name)) if name else child_name + named_apply( + fn=fn, + module=child_module, + name=child_name, + depth_first=depth_first, + include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +class BlockChunk(nn.ModuleList): + + def forward(self, x): + for b in self: + x = b(x) + return x + + +class DinoVisionTransformer(nn.Module): + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4.0, + qkv_bias=True, + ffn_bias=True, + proj_bias=True, + drop_path_rate=0.0, + drop_path_uniform=False, + init_values=None, # for layerscale: None or 0 => no layerscale + embed_layer=PatchEmbed, + act_layer=nn.GELU, + block_fn=Block, + ffn_layer='mlp', + block_chunks=1, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + proj_bias (bool): enable bias for proj in attn if True + ffn_bias (bool): enable bias for ffn if True + drop_path_rate (float): stochastic depth rate + drop_path_uniform (bool): apply uniform drop rate across blocks + weight_init (str): weight init scheme + init_values (float): layer-scale init values + embed_layer (nn.Module): patch embedding layer + act_layer (nn.Module): MLP activation layer + block_fn (nn.Module): transformer block class + ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity" + block_chunks: (int) split block sequence into block_chunks units for FSDP wrap + """ + super().__init__() + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_tokens = 1 + self.n_blocks = depth + self.num_heads = num_heads + self.patch_size = patch_size + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, num_patches + self.num_tokens, embed_dim)) + + if drop_path_uniform is True: + dpr = [drop_path_rate] * depth + else: + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + if ffn_layer == 'mlp': + logger.info('using MLP layer as FFN') + ffn_layer = Mlp + elif ffn_layer == 'swiglufused' or ffn_layer == 
'swiglu': + logger.info('using SwiGLU layer as FFN') + ffn_layer = SwiGLUFFNFused + elif ffn_layer == 'identity': + logger.info('using Identity layer as FFN') + + def f(*args, **kwargs): + return nn.Identity() + + ffn_layer = f + else: + raise NotImplementedError + + blocks_list = [ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + proj_bias=proj_bias, + ffn_bias=ffn_bias, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer, + ffn_layer=ffn_layer, + init_values=init_values, + ) for i in range(depth) + ] + if block_chunks > 0: + self.chunked_blocks = True + chunked_blocks = [] + chunksize = depth // block_chunks + for i in range(0, depth, chunksize): + # this is to keep the block index consistent if we chunk the block list + chunked_blocks.append([nn.Identity()] * i + + blocks_list[i:i + chunksize]) + self.blocks = nn.ModuleList( + [BlockChunk(p) for p in chunked_blocks]) + else: + self.chunked_blocks = False + self.blocks = nn.ModuleList(blocks_list) + + self.norm = norm_layer(embed_dim) + self.head = nn.Identity() + + self.mask_token = nn.Parameter(torch.zeros(1, embed_dim)) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(init_weights_vit_timm, self) + + def interpolate_pos_encoding(self, x, w, h): + previous_dtype = x.dtype + npatch = x.shape[1] - 1 + N = self.pos_embed.shape[1] - 1 + if npatch == N and w == h: + return self.pos_embed + pos_embed = self.pos_embed.float() + class_pos_embed = pos_embed[:, 0] + patch_pos_embed = pos_embed[:, 1:] + dim = x.shape[-1] + w0 = w // self.patch_size + h0 = h // self.patch_size + # we add a small number to avoid floating point error in the interpolation + # see discussion at https://github.com/facebookresearch/dino/issues/8 + w0, h0 = w0 + 0.1, h0 + 0.1 + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed.reshape(1, int(math.sqrt(N)), int(math.sqrt(N)), + dim).permute(0, 3, 1, 2), + scale_factor=(w0 / math.sqrt(N), h0 / math.sqrt(N)), + mode='bicubic', + ) + + assert int(w0) == patch_pos_embed.shape[-2] and int( + h0) == patch_pos_embed.shape[-1] + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), + dim=1).to(previous_dtype) + + def prepare_tokens_with_masks(self, x, masks=None): + B, nc, w, h = x.shape + x = self.patch_embed(x) + if masks is not None: + x = torch.where( + masks.unsqueeze(-1), + self.mask_token.to(x.dtype).unsqueeze(0), x) + + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.interpolate_pos_encoding(x, w, h) + + return x + + def forward_features_list(self, x_list, masks_list): + x = [ + self.prepare_tokens_with_masks(x, masks) + for x, masks in zip(x_list, masks_list) + ] + for blk in self.blocks: + x = blk(x) + + all_x = x + output = [] + for x, masks in zip(all_x, masks_list): + x_norm = self.norm(x) + output.append({ + 'x_norm_clstoken': x_norm[:, 0], + 'x_norm_patchtokens': x_norm[:, 1:], + 'x_prenorm': x, + 'masks': masks, + }) + return output + + def forward_features(self, x, masks=None): + if isinstance(x, list): + return self.forward_features_list(x, masks) + + x = self.prepare_tokens_with_masks(x, masks) + + for blk in self.blocks: + x = blk(x) + + x_norm = self.norm(x) + return { + 'x_norm_clstoken': x_norm[:, 0], + 'x_norm_patchtokens': x_norm[:, 1:], + 'x_prenorm': x, + 'masks': masks, + } + + def 
_get_intermediate_layers_not_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + # If n is an int, take the n last blocks. If it's a list, take them + output, total_block_len = [], len(self.blocks) + blocks_to_take = range(total_block_len - n, + total_block_len) if isinstance(n, int) else n + for i, blk in enumerate(self.blocks): + x = blk(x) + if i in blocks_to_take: + output.append(x) + assert len(output) == len( + blocks_to_take + ), f'only {len(output)} / {len(blocks_to_take)} blocks found' + return output + + def _get_intermediate_layers_chunked(self, x, n=1): + x = self.prepare_tokens_with_masks(x) + output, i, total_block_len = [], 0, len(self.blocks[-1]) + # If n is an int, take the n last blocks. If it's a list, take them + blocks_to_take = range(total_block_len - n, + total_block_len) if isinstance(n, int) else n + for block_chunk in self.blocks: + for blk in block_chunk[i:]: # Passing the nn.Identity() + x = blk(x) + if i in blocks_to_take: + output.append(x) + i += 1 + assert len(output) == len( + blocks_to_take + ), f'only {len(output)} / {len(blocks_to_take)} blocks found' + return output + + def get_intermediate_layers( + self, + x: torch.Tensor, + n: Union[int, Sequence] = 1, # Layers or n last layers to take + reshape: bool = False, + return_class_token: bool = False, + norm=True, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]: + if self.chunked_blocks: + outputs = self._get_intermediate_layers_chunked(x, n) + else: + outputs = self._get_intermediate_layers_not_chunked(x, n) + if norm: + outputs = [self.norm(out) for out in outputs] + class_tokens = [out[:, 0] for out in outputs] + outputs = [out[:, 1:] for out in outputs] + if reshape: + B, _, w, h = x.shape + outputs = [ + out.reshape(B, w // self.patch_size, h // self.patch_size, + -1).permute(0, 3, 1, 2).contiguous() + for out in outputs + ] + if return_class_token: + return tuple(zip(outputs, class_tokens)) + return tuple(outputs) + + def forward(self, *args, is_training=False, **kwargs): + ret = self.forward_features(*args, **kwargs) + if is_training: + return ret + else: + return self.head(ret['x_norm_clstoken']) + + +def init_weights_vit_timm(module: nn.Module, name: str = ''): + """ViT weight initialization, original timm impl (for reproducibility)""" + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=0.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def vit_small(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=384, + depth=12, + num_heads=6, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_base(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_large(patch_size=16, **kwargs): + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1024, + depth=24, + num_heads=16, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model + + +def vit_giant2(patch_size=16, **kwargs): + """ + Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64 + """ + model = DinoVisionTransformer( + patch_size=patch_size, + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + block_fn=partial(Block, attn_class=MemEffAttention), + **kwargs, + ) + return model diff --git 
a/modelscope/models/cv/anydoor/dinov2/hubconf.py b/modelscope/models/cv/anydoor/dinov2/hubconf.py new file mode 100644 index 000000000..42660f64e --- /dev/null +++ b/modelscope/models/cv/anydoor/dinov2/hubconf.py @@ -0,0 +1,195 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +dependencies = ['torch'] + +_DINOV2_BASE_URL = 'https://dl.fbaipublicfiles.com/dinov2' + + +def _make_dinov2_model_name(arch_name: str, patch_size: int) -> str: + compact_arch_name = arch_name.replace('_', '')[:4] + return f'dinov2_{compact_arch_name}{patch_size}' + + +def _make_dinov2_model( + *, + arch_name: str = 'vit_large', + img_size: int = 518, + patch_size: int = 14, + init_values: float = 1.0, + ffn_layer: str = 'mlp', + block_chunks: int = 0, + pretrained: bool = True, + **kwargs, +): + from .dinov2.models import vision_transformer as vits + + _ = _make_dinov2_model_name(arch_name, patch_size) + vit_kwargs = dict( + img_size=img_size, + patch_size=patch_size, + init_values=init_values, + ffn_layer=ffn_layer, + block_chunks=block_chunks, + ) + vit_kwargs.update(**kwargs) + model = vits.__dict__[arch_name](**vit_kwargs) + + # if pretrained: + # state_dict = torch.load('') + # model.load_state_dict(state_dict, strict=False) + return model + + +def dinov2_vits14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-S/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_small', pretrained=pretrained, **kwargs) + + +def dinov2_vitb14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-B/14 model pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_base', pretrained=pretrained, **kwargs) + + +def dinov2_vitl14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-L/14 model (optionally) pretrained on the LVD-142M dataset. + """ + return _make_dinov2_model( + arch_name='vit_large', pretrained=pretrained, **kwargs) + + +def dinov2_vitg14(*, pretrained: bool = True, **kwargs): + """ + DINOv2 ViT-g/14 model (optionally) pretrained on the LVD-142M dataset. 
+ """ + return _make_dinov2_model( + arch_name='vit_giant2', + ffn_layer='swiglufused', + pretrained=pretrained, + **kwargs) + + +def _make_dinov2_linear_head( + *, + model_name: str = 'dinov2_vitl14', + embed_dim: int = 1024, + layers: int = 4, + pretrained: bool = True, + **kwargs, +): + assert layers in (1, 4), f'Unsupported number of layers: {layers}' + linear_head = nn.Linear((1 + layers) * embed_dim, 1_000) + + if pretrained: + layers_str = str(layers) if layers == 4 else '' + url = _DINOV2_BASE_URL + f'/{model_name}/{model_name}_linear{layers_str}_head.pth' + state_dict = torch.hub.load_state_dict_from_url( + url, map_location='cpu') + linear_head.load_state_dict(state_dict, strict=False) + + return linear_head + + +class _LinearClassifierWrapper(nn.Module): + + def __init__(self, + *, + backbone: nn.Module, + linear_head: nn.Module, + layers: int = 4): + super().__init__() + self.backbone = backbone + self.linear_head = linear_head + self.layers = layers + + def forward(self, x): + if self.layers == 1: + x = self.backbone.forward_features(x) + cls_token = x['x_norm_clstoken'].squeeze(0) + patch_tokens = x['x_norm_patchtokens'].squeeze(0) + linear_input = torch.cat([cls_token, patch_tokens.mean(0)]) + elif self.layers == 4: + x = self.backbone.get_intermediate_layers( + x, n=4, return_class_token=True) + linear_input = torch.cat([ + x[0][1].squeeze(0), x[1][1].squeeze(0), x[2][1].squeeze(0), + x[3][1].squeeze(0), x[3][0].squeeze(0).mean(0) + ]) + else: + assert False, f'Unsupported number of layers: {self.layers}' + return self.linear_head(linear_input) + + +def _make_dinov2_linear_classifier( + *, + arch_name: str = 'vit_large', + layers: int = 4, + pretrained: bool = True, + **kwargs, +): + backbone = _make_dinov2_model( + arch_name=arch_name, pretrained=pretrained, **kwargs) + + embed_dim = backbone.embed_dim + patch_size = backbone.patch_size + model_name = _make_dinov2_model_name(arch_name, patch_size) + linear_head = _make_dinov2_linear_head( + model_name=model_name, + embed_dim=embed_dim, + layers=layers, + pretrained=pretrained) + + return _LinearClassifierWrapper( + backbone=backbone, linear_head=linear_head, layers=layers) + + +def dinov2_vits14_lc(*, layers: int = 4, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-S/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_small', layers=layers, pretrained=pretrained, **kwargs) + + +def dinov2_vitb14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-B/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_base', pretrained=pretrained, **kwargs) + + +def dinov2_vitl14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-L/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. + """ + return _make_dinov2_linear_classifier( + arch_name='vit_large', pretrained=pretrained, **kwargs) + + +def dinov2_vitg14_lc(*, pretrained: bool = True, **kwargs): + """ + Linear classifier (1 or 4 layers) on top of a DINOv2 ViT-g/14 backbone (optionally) + pretrained on the LVD-142M dataset and trained on ImageNet-1k. 
+ """ + return _make_dinov2_linear_classifier( + arch_name='vit_giant2', + ffn_layer='swiglufused', + pretrained=pretrained, + **kwargs) diff --git a/modelscope/models/cv/anydoor/ldm/__init__.py b/modelscope/models/cv/anydoor/ldm/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/__init__.py b/modelscope/models/cv/anydoor/ldm/models/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/autoencoder.py b/modelscope/models/cv/anydoor/ldm/models/autoencoder.py new file mode 100644 index 000000000..cfa91c1eb --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/autoencoder.py @@ -0,0 +1,274 @@ +from contextlib import contextmanager + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F + +from ...ldm.modules.diffusionmodules.model import Decoder, Encoder +from ...ldm.modules.distributions.distributions import \ + DiagonalGaussianDistribution +from ...ldm.modules.ema import LitEma +from ...ldm.util import instantiate_from_config + + +class AutoencoderKL(pl.LightningModule): + + def __init__(self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + ema_decay=None, + learn_logvar=False): + super().__init__() + self.learn_logvar = learn_logvar + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig['double_z'] + self.quant_conv = torch.nn.Conv2d(2 * ddconfig['z_channels'], + 2 * embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + + self.use_ema = ema_decay is not None + if self.use_ema: + self.ema_decay = ema_decay + assert 0. < ema_decay < 1. 
+ self.model_ema = LitEma(self, decay=ema_decay) + print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + print(f'Restored from {path}') + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print(f'{context}: Switched to EMA weights') + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print(f'{context}: Restored training weights') + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self) + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log( + 'aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + + self.log( + 'discloss', + discloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + _ = self._validation_step(batch, batch_idx, postfix='_ema') + return log_dict + + def _validation_step(self, batch, batch_idx, postfix=''): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + postfix) + + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + postfix) + + self.log(f'val{postfix}/rec_loss', + log_dict_ae[f'val{postfix}/rec_loss']) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + 
return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + ae_params_list = list(self.encoder.parameters()) + list( + self.decoder.parameters()) + list( + self.quant_conv.parameters()) + list( + self.post_quant_conv.parameters()) + if self.learn_logvar: + print(f'{self.__class__.__name__}: Learning logvar') + ae_params_list.append(self.loss.logvar) + opt_ae = torch.optim.Adam(ae_params_list, lr=lr, betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['samples'] = self.decode(torch.randn_like(posterior.sample())) + log['reconstructions'] = xrec + if log_ema or self.use_ema: + with self.ema_scope(): + xrec_ema, posterior_ema = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec_ema.shape[1] > 3 + xrec_ema = self.to_rgb(xrec_ema) + log['samples_ema'] = self.decode( + torch.randn_like(posterior_ema.sample())) + log['reconstructions_ema'] = xrec_ema + log['inputs'] = x + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. + return x + + +class IdentityFirstStage(torch.nn.Module): + + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/__init__.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py new file mode 100644 index 000000000..53a98fc73 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddim.py @@ -0,0 +1,446 @@ +"""SAMPLING ONLY.""" + +import numpy as np +import torch +from tqdm import tqdm + +from ....ldm.modules.diffusionmodules.util import ( + extract_into_tensor, make_ddim_sampling_parameters, make_ddim_timesteps, + noise_like) + + +class DDIMSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + 
verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample(self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + 
unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = reversed(range( + 0, timesteps)) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. - mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + model_output = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + if isinstance(c, dict): + assert isinstance(unconditional_conditioning, dict) + c_in = dict() + for k in c: + if isinstance(c[k], list): + c_in[k] = [ + torch.cat( + [unconditional_conditioning[k][i], c[k][i]]) + for i in range(len(c[k])) + ] + else: + c_in[k] = torch.cat( + [unconditional_conditioning[k], c[k]]) + elif isinstance(c, list): + c_in = list() + assert isinstance(unconditional_conditioning, list) + 
for i in range(len(c)): + c_in.append( + torch.cat([unconditional_conditioning[i], c[i]])) + else: + c_in = torch.cat([unconditional_conditioning, c]) + model_uncond, model_t = self.model.apply_model(x_in, t_in, + c_in).chunk(2) + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond) + + if self.model.parameterization == 'v': + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == 'eps', 'not implemented' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + if self.model.parameterization != 'v': + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode(self, + x0, + c, + t_enc, + use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None): + num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[ + 0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc='Encoding Image'): + t = torch.full((x0.shape[0], ), + i, + device=self.model.device, + dtype=torch.long) + if unconditional_guidance_scale == 1.: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model( + torch.cat((x_next, x_next)), torch.cat((t, t)), + torch.cat((unconditional_conditioning, c))), 2) + tmp = noise_pred - e_t_uncond + noise_pred = e_t_uncond + unconditional_guidance_scale * tmp + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + tmp = (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt() + weighted_noise_pred = alphas_next[i].sqrt() * tmp * noise_pred + x_next = xt_weighted + weighted_noise_pred + if return_intermediates and i % (num_steps // return_intermediates + ) == 0 and i < num_steps - 1: + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} + if return_intermediates: + out.update({'intermediates': intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) + * noise) + + @torch.no_grad() + def decode(self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None): + + timesteps = np.arange(self.ddpm_num_timesteps + ) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f'Running DDIM Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((x_latent.shape[0], ), + step, + 
device=x_latent.device, + dtype=torch.long) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + if callback: + callback(i) + return x_dec diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py new file mode 100644 index 000000000..78faa630e --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/ddpm.py @@ -0,0 +1,2295 @@ +""" +wild mixture of +https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +https://github.com/openai/improved-diffusion/blob/e94489283bb876ac1477d5dd7709bbbd2d9902ce/improved_diffusion/gaussian_diffusion.py +https://github.com/CompVis/taming-transformers +-- merci +""" + +import itertools +import os +from contextlib import contextmanager, nullcontext +from functools import partial + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops import rearrange, repeat +from omegaconf import ListConfig +from pytorch_lightning.utilities.distributed import rank_zero_only +from torch.optim.lr_scheduler import LambdaLR +from torchvision.utils import make_grid +from tqdm import tqdm + +from ....ldm.models.autoencoder import AutoencoderKL, IdentityFirstStage +from ....ldm.models.diffusion.ddim import DDIMSampler +from ....ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_beta_schedule, + noise_like) +from ....ldm.modules.distributions.distributions import ( + DiagonalGaussianDistribution, normal_kl) +from ....ldm.modules.ema import LitEma +from ....ldm.util import (count_params, default, exists, + instantiate_from_config, isimage, ismap, + log_txt_as_img, mean_flat) + +__conditioning_keys__ = { + 'concat': 'c_concat', + 'crossattn': 'c_crossattn', + 'adm': 'y' +} + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +def uniform_on_device(r1, r2, shape, device): + return (r1 - r2) * torch.rand(*shape, device=device) + r2 + + +class DDPM(pl.LightningModule): + # classic DDPM with Gaussian diffusion, in image space + def __init__( + self, + unet_config, + timesteps=1000, + beta_schedule='linear', + loss_type='l2', + ckpt_path=None, + ignore_keys=[], + load_only_unet=False, + monitor='val/loss', + use_ema=True, + first_stage_key='image', + image_size=256, + channels=3, + log_every_t=100, + clip_denoised=True, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + given_betas=None, + original_elbo_weight=0., + v_posterior=0., # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + l_simple_weight=1., + conditioning_key=None, + parameterization='eps', # all assuming fixed variance schedules + scheduler_config=None, + use_positional_encodings=False, + learn_logvar=False, + logvar_init=0., + make_it_fit=False, + ucg_training=None, + reset_ema=False, + reset_num_ema_updates=False, + **kwargs): + super().__init__() + assert parameterization in [ + 'eps', 'x0', 'v' + ], 'currently only supporting "eps" and "x0" and "v"' + self.parameterization = parameterization + print( + f'{self.__class__.__name__}: Running in {self.parameterization}-prediction mode' + ) + self.cond_stage_model = 
None
+        self.clip_denoised = clip_denoised
+        self.log_every_t = log_every_t
+        self.first_stage_key = first_stage_key
+        self.image_size = image_size  # try conv?
+        self.channels = channels
+        self.use_positional_encodings = use_positional_encodings
+        self.model = DiffusionWrapper(unet_config, conditioning_key)
+        count_params(self.model, verbose=True)
+        self.use_ema = use_ema
+        if self.use_ema:
+            self.model_ema = LitEma(self.model)
+            print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.')
+
+        self.use_scheduler = scheduler_config is not None
+        if self.use_scheduler:
+            self.scheduler_config = scheduler_config
+
+        self.v_posterior = v_posterior
+        self.original_elbo_weight = original_elbo_weight
+        self.l_simple_weight = l_simple_weight
+
+        if monitor is not None:
+            self.monitor = monitor
+        self.make_it_fit = make_it_fit
+        if reset_ema:
+            assert exists(ckpt_path)
+        if ckpt_path is not None:
+            self.init_from_ckpt(
+                ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet)
+            if reset_ema:
+                assert self.use_ema
+                print(
+                    'Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.'
+                )
+                self.model_ema = LitEma(self.model)
+        if reset_num_ema_updates:
+            print(
+                ' +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ '
+            )
+            assert self.use_ema
+            self.model_ema.reset_num_updates()
+
+        self.register_schedule(
+            given_betas=given_betas,
+            beta_schedule=beta_schedule,
+            timesteps=timesteps,
+            linear_start=linear_start,
+            linear_end=linear_end,
+            cosine_s=cosine_s)
+
+        self.loss_type = loss_type
+
+        self.learn_logvar = learn_logvar
+        logvar = torch.full(
+            fill_value=logvar_init, size=(self.num_timesteps, ))
+        if self.learn_logvar:
+            self.logvar = nn.Parameter(logvar, requires_grad=True)
+        else:
+            self.register_buffer('logvar', logvar)
+
+        self.ucg_training = ucg_training or dict()
+        if self.ucg_training:
+            self.ucg_prng = np.random.RandomState()
+
+    def register_schedule(self,
+                          given_betas=None,
+                          beta_schedule='linear',
+                          timesteps=1000,
+                          linear_start=1e-4,
+                          linear_end=2e-2,
+                          cosine_s=8e-3):
+        if exists(given_betas):
+            betas = given_betas
+        else:
+            betas = make_beta_schedule(
+                beta_schedule,
+                timesteps,
+                linear_start=linear_start,
+                linear_end=linear_end,
+                cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])
+
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert alphas_cumprod.shape[
+            0] == self.num_timesteps, 'alphas have to be defined for each timestep'
+
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+
+        self.register_buffer('betas', to_torch(betas))
+        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
+        self.register_buffer('alphas_cumprod_prev',
+                             to_torch(alphas_cumprod_prev))
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer('sqrt_alphas_cumprod',
+                             to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer('sqrt_one_minus_alphas_cumprod',
+                             to_torch(np.sqrt(1. - alphas_cumprod)))
+        self.register_buffer('log_one_minus_alphas_cumprod',
+                             to_torch(np.log(1. - alphas_cumprod)))
+        self.register_buffer('sqrt_recip_alphas_cumprod',
+                             to_torch(np.sqrt(1. / alphas_cumprod)))
+        self.register_buffer('sqrt_recipm1_alphas_cumprod',
+                             to_torch(np.sqrt(1. / alphas_cumprod - 1)))
+
+        # calculations for posterior q(x_{t-1} | x_t, x_0)
+        posterior_variance = (1 - self.v_posterior) * betas * (
+            1. - alphas_cumprod_prev) / (
+                1. - alphas_cumprod) + self.v_posterior * betas
+        # above: equal to 1. / (1. / (1. - alpha_cumprod_tm1) + alpha_t / beta_t)
+        self.register_buffer('posterior_variance',
+                             to_torch(posterior_variance))
+        # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain
+        self.register_buffer(
+            'posterior_log_variance_clipped',
+            to_torch(np.log(np.maximum(posterior_variance, 1e-20))))
+        tmp = betas * np.sqrt(alphas_cumprod_prev) / (1. - alphas_cumprod)
+        self.register_buffer('posterior_mean_coef1', to_torch(tmp))
+        tmp = (1. - alphas_cumprod_prev) * np.sqrt(alphas)
+        self.register_buffer('posterior_mean_coef2',
+                             to_torch(tmp / (1. - alphas_cumprod)))
+
+        if self.parameterization == 'eps':
+            tmp = 2 * self.posterior_variance * to_torch(alphas)
+            lvlb_weights = self.betas**2 / (tmp * (1 - self.alphas_cumprod))
+        elif self.parameterization == 'x0':
+            lvlb_weights = 0.5 * np.sqrt(torch.Tensor(alphas_cumprod)) / (
+                2. * 1 - torch.Tensor(alphas_cumprod))
+        elif self.parameterization == 'v':
+            tmp = 2 * self.posterior_variance * to_torch(alphas)
+            tmp = self.betas**2 / (tmp * (1 - self.alphas_cumprod))
+            lvlb_weights = torch.ones_like(tmp)
+        else:
+            raise NotImplementedError('mu not supported')
+        lvlb_weights[0] = lvlb_weights[1]
+        self.register_buffer('lvlb_weights', lvlb_weights, persistent=False)
+        assert not torch.isnan(self.lvlb_weights).all()
+
+    @contextmanager
+    def ema_scope(self, context=None):
+        if self.use_ema:
+            self.model_ema.store(self.model.parameters())
+            self.model_ema.copy_to(self.model)
+            if context is not None:
+                print(f'{context}: Switched to EMA weights')
+        try:
+            yield None
+        finally:
+            if self.use_ema:
+                self.model_ema.restore(self.model.parameters())
+                if context is not None:
+                    print(f'{context}: Restored training weights')
+
+    @torch.no_grad()
+    def init_from_ckpt(self, path, ignore_keys=list(), only_model=False):
+        sd = torch.load(path, map_location='cpu')
+        if 'state_dict' in list(sd.keys()):
+            sd = sd['state_dict']
+        keys = list(sd.keys())
+        for k in keys:
+            for ik in ignore_keys:
+                if k.startswith(ik):
+                    print('Deleting key {} from state_dict.'.format(k))
+                    del sd[k]
+        if self.make_it_fit:
+            n_params = len([
+                name for name, _ in itertools.chain(self.named_parameters(),
+                                                    self.named_buffers())
+            ])
+            for name, param in tqdm(
+                    itertools.chain(self.named_parameters(),
+                                    self.named_buffers()),
+                    desc='Fitting old weights to new weights',
+                    total=n_params):
+                if name not in sd:
+                    continue
+                old_shape = sd[name].shape
+                new_shape = param.shape
+                assert len(old_shape) == len(new_shape)
+                if len(new_shape) > 2:
+                    # we only modify first two axes
+                    assert new_shape[2:] == old_shape[2:]
+                # assumes first axis corresponds to output dim
+                if not new_shape == old_shape:
+                    new_param = param.clone()
+                    old_param = sd[name]
+                    if len(new_shape) == 1:
+                        for i in range(new_param.shape[0]):
+                            new_param[i] = old_param[i % old_shape[0]]
+                    elif len(new_shape) >= 2:
+                        for i in range(new_param.shape[0]):
+                            for j in range(new_param.shape[1]):
+                                new_param[i, j] = old_param[i % old_shape[0],
+                                                            j % old_shape[1]]
+
+                        n_used_old = torch.ones(old_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_old[j % old_shape[1]] += 1
+                        n_used_new = torch.zeros(new_shape[1])
+                        for j in range(new_param.shape[1]):
+                            n_used_new[j] = n_used_old[j % old_shape[1]]
+
+                        n_used_new = n_used_new[None, :]
+                        while len(n_used_new.shape) < len(new_shape):
+                            n_used_new = n_used_new.unsqueeze(-1)
+                        new_param /= n_used_new
+
+                    sd[name] = new_param
+
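+        # strict=False below reports missing/unexpected keys instead of raising,
+        # so partially matching checkpoints can still be loaded.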
+ missing, unexpected = self.load_state_dict( + sd, + strict=False) if not only_model else self.model.load_state_dict( + sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys:\n {missing}') + if len(unexpected) > 0: + print(f'\nUnexpected Keys:\n {unexpected}') + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start) + variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, + x_start.shape) + log_variance = extract_into_tensor(self.log_one_minus_alphas_cumprod, + t, x_start.shape) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) + * x_t - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, + x_t.shape) * noise) + + def predict_start_from_z_and_v(self, x_t, t, v): + # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_t.shape) * v) + + def predict_eps_from_z_and_v(self, x_t, t, v): + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_t.shape) * x_t) + + def q_posterior(self, x_start, x_t, t): + tmp1 = extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) + tmp2 = extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) + posterior_mean = (tmp1 * x_start + tmp2 * x_t) + posterior_variance = extract_into_tensor(self.posterior_variance, t, + x_t.shape) + posterior_log_variance_clipped = extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, x, t, clip_denoised: bool): + model_out = self.model(x, t) + if self.parameterization == 'eps': + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == 'x0': + x_recon = model_out + if clip_denoised: + x_recon.clamp_(-1., 1.) 
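+ # feed the (optionally clamped) x_0 estimate into the closed-form posterior
+ # q(x_{t-1} | x_t, x_0) computed by q_posterior below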
+ + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t) + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, x, t, clip_denoised=True, repeat_noise=False): + b, *_, device = *x.shape, x.device + model_mean, _, model_log_variance = self.p_mean_variance( + x=x, t=t, clip_denoised=clip_denoised) + noise = noise_like(x.shape, device, repeat_noise) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape( + b, *((1, ) * (len(x.shape) - 1))) + return model_mean + nonzero_mask * (0.5 + * model_log_variance).exp() * noise + + @torch.no_grad() + def p_sample_loop(self, shape, return_intermediates=False): + device = self.betas.device + b = shape[0] + img = torch.randn(shape, device=device) + intermediates = [img] + for i in tqdm( + reversed(range(0, self.num_timesteps)), + desc='Sampling t', + total=self.num_timesteps): + img = self.p_sample( + img, + torch.full((b, ), i, device=device, dtype=torch.long), + clip_denoised=self.clip_denoised) + if i % self.log_every_t == 0 or i == self.num_timesteps - 1: + intermediates.append(img) + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, batch_size=16, return_intermediates=False): + image_size = self.image_size + channels = self.channels + return self.p_sample_loop( + (batch_size, channels, image_size, image_size), + return_intermediates=return_intermediates) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def get_v(self, x, noise, t): + tmp1 = extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) + tmp2 = extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x.shape) + return (tmp1 * noise - tmp2 * x) + + def get_loss(self, pred, target, mean=True): + if self.loss_type == 'l1': + loss = (target - pred).abs() + if mean: + loss = loss.mean() + elif self.loss_type == 'l2': + if mean: + loss = torch.nn.functional.mse_loss(target, pred) + else: + loss = torch.nn.functional.mse_loss( + target, pred, reduction='none') + else: + raise NotImplementedError("unknown loss type '{loss_type}'") + + return loss + + def p_losses(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + model_out = self.model(x_noisy, t) + + loss_dict = {} + if self.parameterization == 'eps': + target = noise + elif self.parameterization == 'x0': + target = x_start + elif self.parameterization == 'v': + target = self.get_v(x_start, noise, t) + else: + raise NotImplementedError( + f'Parameterization {self.parameterization} not yet supported') + + loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3]) + + log_prefix = 'train' if self.training else 'val' + + loss_dict.update({f'{log_prefix}/loss_simple': loss.mean()}) + loss_simple = loss.mean() * self.l_simple_weight + + loss_vlb = (self.lvlb_weights[t] * loss).mean() + loss_dict.update({f'{log_prefix}/loss_vlb': loss_vlb}) + + loss = loss_simple + self.original_elbo_weight * loss_vlb + + loss_dict.update({f'{log_prefix}/loss': loss}) + + return loss, loss_dict + + def forward(self, x, *args, **kwargs): + # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size + # assert h == img_size and w == 
img_size, f'height and width of image must be {img_size}' + t = torch.randint( + 0, self.num_timesteps, (x.shape[0], ), device=self.device).long() + return self.p_losses(x, t, *args, **kwargs) + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = rearrange(x, 'b h w c -> b c h w') + x = x.to(memory_format=torch.contiguous_format).float() + return x + + def shared_step(self, batch): + x = self.get_input(batch, self.first_stage_key) + loss, loss_dict = self(x) + return loss, loss_dict + + def training_step(self, batch, batch_idx): + for k in self.ucg_training: + p = self.ucg_training[k]['p'] + val = self.ucg_training[k]['val'] + if val is None: + val = '' + for i in range(len(batch[k])): + if self.ucg_prng.choice(2, p=[1 - p, p]): + batch[k][i] = val + + loss, loss_dict = self.shared_step(batch) + + self.log_dict( + loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True) + + self.log( + 'global_step', + self.global_step, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False) + + if self.use_scheduler: + lr = self.optimizers().param_groups[0]['lr'] + self.log( + 'lr_abs', + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False) + + return loss + + @torch.no_grad() + def validation_step(self, batch, batch_idx): + _, loss_dict_no_ema = self.shared_step(batch) + with self.ema_scope(): + _, loss_dict_ema = self.shared_step(batch) + loss_dict_ema = { + key + '_ema': loss_dict_ema[key] + for key in loss_dict_ema + } + self.log_dict( + loss_dict_no_ema, + prog_bar=False, + logger=True, + on_step=False, + on_epoch=True) + self.log_dict( + loss_dict_ema, + prog_bar=False, + logger=True, + on_step=False, + on_epoch=True) + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self.model) + + def _get_rows_from_list(self, samples): + n_imgs_per_row = len(samples) + denoise_grid = rearrange(samples, 'n b c h w -> b n c h w') + denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=2, + sample=True, + return_keys=None, + **kwargs): + log = dict() + x = self.get_input(batch, self.first_stage_key) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + x = x.to(self.device)[:N] + log['inputs'] = x + + # get diffusion row + diffusion_row = list() + x_start = x[:n_row] + + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(x_start) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + diffusion_row.append(x_noisy) + + log['diffusion_row'] = self._get_rows_from_list(diffusion_row) + + if sample: + # get denoise row + with self.ema_scope('Plotting'): + samples, denoise_row = self.sample( + batch_size=N, return_intermediates=True) + + log['samples'] = samples + log['denoise_row'] = self._get_rows_from_list(denoise_row) + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.learn_logvar: + params = params + [self.logvar] + opt = torch.optim.AdamW(params, lr=lr) + return opt + + +class LatentDiffusion(DDPM): + """main class""" + + def __init__(self, + 
first_stage_config, + cond_stage_config, + num_timesteps_cond=None, + cond_stage_key='image', + cond_stage_trainable=False, + concat_mode=True, + cond_stage_forward=None, + conditioning_key=None, + scale_factor=1.0, + scale_by_std=False, + force_null_conditioning=False, + *args, + **kwargs): + self.model_dir = kwargs.get('model_dir') + self.force_null_conditioning = force_null_conditioning + self.num_timesteps_cond = default(num_timesteps_cond, 1) + self.scale_by_std = scale_by_std + assert self.num_timesteps_cond <= kwargs['timesteps'] + # for backwards compatibility after implementation of DiffusionWrapper + if conditioning_key is None: + conditioning_key = 'concat' if concat_mode else 'crossattn' + if cond_stage_config == '__is_unconditional__' and not self.force_null_conditioning: + conditioning_key = None + ckpt_path = kwargs.pop('ckpt_path', None) + reset_ema = kwargs.pop('reset_ema', False) + reset_num_ema_updates = kwargs.pop('reset_num_ema_updates', False) + ignore_keys = kwargs.pop('ignore_keys', []) + super().__init__(conditioning_key=conditioning_key, *args, **kwargs) + self.concat_mode = concat_mode + self.cond_stage_trainable = cond_stage_trainable + self.cond_stage_key = cond_stage_key + try: + self.num_downs = len( + first_stage_config.params.ddconfig.ch_mult) - 1 + except Exception: + self.num_downs = 0 + if not scale_by_std: + self.scale_factor = scale_factor + else: + self.register_buffer('scale_factor', torch.tensor(scale_factor)) + self.instantiate_first_stage(first_stage_config) + self.instantiate_cond_stage(cond_stage_config) + self.cond_stage_forward = cond_stage_forward + self.clip_denoised = False + self.bbox_tokenizer = None + + self.restarted_from_ckpt = False + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys) + self.restarted_from_ckpt = True + if reset_ema: + assert self.use_ema + print( + 'Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint.' + ) + self.model_ema = LitEma(self.model) + if reset_num_ema_updates: + print( + ' +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ ' + ) + assert self.use_ema + self.model_ema.reset_num_updates() + + def make_cond_schedule(self, ): + self.cond_ids = torch.full( + size=(self.num_timesteps, ), + fill_value=self.num_timesteps - 1, + dtype=torch.long) + ids = torch.round( + torch.linspace(0, self.num_timesteps - 1, + self.num_timesteps_cond)).long() + self.cond_ids[:self.num_timesteps_cond] = ids + + @rank_zero_only + @torch.no_grad() + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + # only for very first batch + if (self.scale_by_std and self.current_epoch == 0 + and self.global_step == 0 and batch_idx == 0 + and not self.restarted_from_ckpt): + assert self.scale_factor == 1., 'rather not use custom rescaling and std-rescaling simultaneously' + # set rescale weight to 1./std of encodings + print('### USING STD-RESCALING ###') + x = super().get_input(batch, self.first_stage_key) + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + del self.scale_factor + self.register_buffer('scale_factor', 1. 
/ z.flatten().std()) + print(f'setting self.scale_factor to {self.scale_factor}') + print('### USING STD-RESCALING ###') + + def register_schedule(self, + given_betas=None, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + super().register_schedule(given_betas, beta_schedule, timesteps, + linear_start, linear_end, cosine_s) + + self.shorten_cond_schedule = self.num_timesteps_cond > 1 + if self.shorten_cond_schedule: + self.make_cond_schedule() + + def instantiate_first_stage(self, config): + model = instantiate_from_config(config) + self.first_stage_model = model.eval() + self.first_stage_model.train = disabled_train + for param in self.first_stage_model.parameters(): + param.requires_grad = False + + def instantiate_cond_stage(self, config): + config.params.model_path = os.path.join(self.model_dir, + config.params.model_path) + if not self.cond_stage_trainable: + if config == '__is_first_stage__': + print('Using first stage also as cond stage.') + self.cond_stage_model = self.first_stage_model + elif config == '__is_unconditional__': + print( + f'Training {self.__class__.__name__} as an unconditional model.' + ) + self.cond_stage_model = None + # self.be_unconditional = True + else: + model = instantiate_from_config(config) + self.cond_stage_model = model.eval() + self.cond_stage_model.train = disabled_train + for param in self.cond_stage_model.parameters(): + param.requires_grad = False + else: + assert config != '__is_first_stage__' + assert config != '__is_unconditional__' + model = instantiate_from_config(config) + self.cond_stage_model = model + + def _get_denoise_row_from_list(self, + samples, + desc='', + force_no_decoder_quantization=False): + denoise_row = [] + for zd in tqdm(samples, desc=desc): + denoise_row.append( + self.decode_first_stage( + zd.to(self.device), + force_not_quantize=force_no_decoder_quantization)) + n_imgs_per_row = len(denoise_row) + denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W + denoise_grid = rearrange(denoise_row, 'n b c h w -> b n c h w') + denoise_grid = rearrange(denoise_grid, 'b n c h w -> (b n) c h w') + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + def get_first_stage_encoding(self, encoder_posterior): + if isinstance(encoder_posterior, DiagonalGaussianDistribution): + z = encoder_posterior.sample() + elif isinstance(encoder_posterior, torch.Tensor): + z = encoder_posterior + else: + raise NotImplementedError( + f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" + ) + return self.scale_factor * z + + def get_learned_conditioning(self, c): + # c 1,3,224,224 + if self.cond_stage_forward is None: + if hasattr(self.cond_stage_model, 'encode') and callable( + self.cond_stage_model.encode): + # 1,1,1024 + c = self.cond_stage_model.encode(c) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + else: + c = self.cond_stage_model(c) + else: + assert hasattr(self.cond_stage_model, self.cond_stage_forward) + c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) + return c + + def meshgrid(self, h, w): + y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1) + x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1) + + arr = torch.cat([y, x], dim=-1) + return arr + + def delta_border(self, h, w): + """ + :param h: height + :param w: width + :return: normalized distance to image border, + wtith min distance = 0 at border and max dist = 0.5 at image center + """ + lower_right_corner = torch.tensor([h - 1, 
w - 1]).view(1, 1, 2) + arr = self.meshgrid(h, w) / lower_right_corner + dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0] + dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0] + edge_dist = torch.min( + torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1)[0] + return edge_dist + + def get_weighting(self, h, w, Ly, Lx, device): + weighting = self.delta_border(h, w) + weighting = torch.clip( + weighting, + self.split_input_params['clip_min_weight'], + self.split_input_params['clip_max_weight'], + ) + weighting = weighting.view(1, h * w, 1).repeat(1, 1, + Ly * Lx).to(device) + + if self.split_input_params['tie_braker']: + L_weighting = self.delta_border(Ly, Lx) + L_weighting = torch.clip( + L_weighting, self.split_input_params['clip_min_tie_weight'], + self.split_input_params['clip_max_tie_weight']) + + L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device) + weighting = weighting * L_weighting + return weighting + + def get_fold_unfold(self, + x, + kernel_size, + stride, + uf=1, + df=1): # todo load once not every time, shorten code + """ + :param x: img of size (bs, c, h, w) + :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1]) + """ + bs, nc, h, w = x.shape + + # number of crops in image + Ly = (h - kernel_size[0]) // stride[0] + 1 + Lx = (w - kernel_size[1]) // stride[1] + 1 + + if uf == 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params) + + weighting = self.get_weighting(kernel_size[0], kernel_size[1], Ly, + Lx, x.device).to(x.dtype) + normalization = fold(weighting).view(1, 1, h, + w) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0], kernel_size[1], Ly * Lx)) + + elif uf > 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf), + dilation=1, + padding=0, + stride=(stride[0] * uf, stride[1] * uf)) + fold = torch.nn.Fold( + output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2) + + weighting = self.get_weighting(kernel_size[0] * uf, + kernel_size[1] * uf, Ly, Lx, + x.device).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h * uf, w * uf) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx)) + + elif df > 1 and uf == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] // df, kernel_size[0] // df), + dilation=1, + padding=0, + stride=(stride[0] // df, stride[1] // df)) + fold = torch.nn.Fold( + output_size=(x.shape[2] // df, x.shape[3] // df), + **fold_params2) + + weighting = self.get_weighting(kernel_size[0] // df, + kernel_size[1] // df, Ly, Lx, + x.device).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h // df, w // df) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx)) + + else: + raise NotImplementedError + + return fold, unfold, normalization, weighting + + @torch.no_grad() + def get_input(self, + batch, + k, + return_first_stage_outputs=False, + force_c_encode=False, + cond_key=None, + return_original_cond=False, + bs=None, + return_x=False): + x = super().get_input(batch, k) + if bs is not 
None: + x = x[:bs] + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + + if self.model.conditioning_key is not None and not self.force_null_conditioning: + if cond_key is None: + cond_key = self.cond_stage_key + if cond_key != self.first_stage_key: + if cond_key in ['caption', 'coordinates_bbox', 'txt']: + xc = batch[cond_key] + elif cond_key in ['class_label', 'cls']: + xc = batch + else: + xc = super().get_input(batch, cond_key).to(self.device) + else: + xc = x + if not self.cond_stage_trainable or force_c_encode: + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + c = self.get_learned_conditioning(xc.to(self.device)) + else: + c = xc + if bs is not None: + c = c[:bs] + + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + ckey = __conditioning_keys__[self.model.conditioning_key] + c = {ckey: c, 'pos_x': pos_x, 'pos_y': pos_y} + + else: + c = None + xc = None + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + c = {'pos_x': pos_x, 'pos_y': pos_y} + out = [z, c] + if return_first_stage_outputs: + xrec = self.decode_first_stage(z) + out.extend([x, xrec]) + if return_x: + out.extend([x]) + if return_original_cond: + out.append(xc) + return out + + @torch.no_grad() + def decode_first_stage(self, + z, + predict_cids=False, + force_not_quantize=False): + if predict_cids: + if z.dim() == 4: + z = torch.argmax(z.exp(), dim=1).long() + z = self.first_stage_model.quantize.get_codebook_entry( + z, shape=None) + z = rearrange(z, 'b h w c -> b c h w').contiguous() + + z = 1. / self.scale_factor * z + return self.first_stage_model.decode(z) + + @torch.no_grad() + def encode_first_stage(self, x): + return self.first_stage_model.encode(x) + + def shared_step(self, batch, **kwargs): + x, c = self.get_input(batch, self.first_stage_key) + loss = self(x, c) + return loss + + def forward(self, x, c, *args, **kwargs): + # t = torch.randint(0, self.num_timesteps, (x.shape[0],), device=self.device).long() + t = self.time_steps.reshape((x.shape[0], )).to(self.device).long() + + if self.model.conditioning_key is not None: + assert c is not None + if self.cond_stage_trainable: + c = self.get_learned_conditioning(c) + if self.shorten_cond_schedule: # TODO: drop this option + tc = self.cond_ids[t].to(self.device) + c = self.q_sample( + x_start=c, t=tc, noise=torch.randn_like(c.float())) + return self.p_losses(x, c, t, *args, **kwargs) + + def apply_model(self, x_noisy, t, cond, return_ids=False): + if isinstance(cond, dict): + # hybrid case, cond is expected to be a dict + pass + else: + if not isinstance(cond, list): + cond = [cond] + key = 'c_concat' if self.model.conditioning_key == 'concat' else 'c_crossattn' + cond = {key: cond} + + x_recon = self.model(x_noisy, t, **cond) + + if isinstance(x_recon, tuple) and not return_ids: + return x_recon[0] + else: + return x_recon + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + tmp1 = extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, + x_t.shape) + tmp2 = extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, + x_t.shape) + return (tmp1 * x_t - pred_xstart) / tmp2 + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + This term can't be optimized, as it only depends on the encoder. + :param x_start: the [N x C x ...] tensor of inputs. 
+ :return: a batch of [N] KL values (in bits), one per batch element. + """ + batch_size = x_start.shape[0] + t = torch.tensor( + [self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0) + return mean_flat(kl_prior) / np.log(2.0) + + def p_losses(self, x_start, cond, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + model_output = self.apply_model(x_noisy, t, cond) + + loss_dict = {} + prefix = 'train' if self.training else 'val' + + if self.parameterization == 'x0': + target = x_start + elif self.parameterization == 'eps': + target = noise + elif self.parameterization == 'v': + target = self.get_v(x_start, noise, t) + else: + raise NotImplementedError() + + loss_simple = self.get_loss(model_output, target, mean=False) + # boundary = self.boundary.to(loss_simple.device) + # boundary = F.interpolate(boundary, size = (64,64)) * 5 + 1.0 #16,1,64,64 + + # print(loss_simple.shape) #16,4,64,64 + loss_simple = loss_simple.mean([1, 2, 3]) + # .mean([1, 2, 3]) + loss_dict.update({f'{prefix}/loss_simple': loss_simple.mean()}) + + logvar_t = self.logvar[t].to(self.device) + loss = loss_simple / torch.exp(logvar_t) + logvar_t + # loss = loss_simple / torch.exp(self.logvar) + self.logvar + if self.learn_logvar: + loss_dict.update({f'{prefix}/loss_gamma': loss.mean()}) + loss_dict.update({'logvar': self.logvar.data.mean()}) + + loss = self.l_simple_weight * loss.mean() + + loss_vlb = self.get_loss( + model_output, target, mean=False).mean(dim=(1, 2, 3)) + loss_vlb = (self.lvlb_weights[t] * loss_vlb).mean() + loss_dict.update({f'{prefix}/loss_vlb': loss_vlb}) + loss += (self.original_elbo_weight * loss_vlb) + loss_dict.update({f'{prefix}/loss': loss}) + + # print(self.parameterization, self.learn_logvar, self.original_elbo_weight, self.lvlb_weights[t]) + + return loss, loss_dict + + def p_mean_variance(self, + x, + c, + t, + clip_denoised: bool, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + score_corrector=None, + corrector_kwargs=None): + t_in = t + model_out = self.apply_model( + x, t_in, c, return_ids=return_codebook_ids) + + if score_corrector is not None: + assert self.parameterization == 'eps' + model_out = score_corrector.modify_score(self, model_out, x, t, c, + **corrector_kwargs) + + if return_codebook_ids: + model_out, logits = model_out + + if self.parameterization == 'eps': + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == 'x0': + x_recon = model_out + else: + raise NotImplementedError() + + if clip_denoised: + x_recon.clamp_(-1., 1.) 
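+ # Note: latent-space encodings are not bounded to [-1, 1], so callers of this
+ # method typically leave clip_denoised False (LatentDiffusion.__init__ sets
+ # self.clip_denoised = False); clamping is mainly meaningful for pixel-space DDPM.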
+ if quantize_denoised: + x_recon, _, [_, _, + indices] = self.first_stage_model.quantize(x_recon) + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t) + if return_codebook_ids: + return model_mean, posterior_variance, posterior_log_variance, logits + elif return_x0: + return model_mean, posterior_variance, posterior_log_variance, x_recon + else: + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, + x, + c, + t, + clip_denoised=False, + repeat_noise=False, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None): + b, *_, device = *x.shape, x.device + outputs = self.p_mean_variance( + x=x, + c=c, + t=t, + clip_denoised=clip_denoised, + return_codebook_ids=return_codebook_ids, + quantize_denoised=quantize_denoised, + return_x0=return_x0, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs) + if return_codebook_ids: + raise DeprecationWarning('Support dropped.') + model_mean, _, model_log_variance, logits = outputs + elif return_x0: + model_mean, _, model_log_variance, x0 = outputs + else: + model_mean, _, model_log_variance = outputs + + noise = noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape( + b, *((1, ) * (len(x.shape) - 1))) + + if return_codebook_ids: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise, logits.argmax(dim=1) + if return_x0: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise, x0 + else: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance).exp() * noise + + @torch.no_grad() + def progressive_denoising(self, + cond, + shape, + verbose=True, + callback=None, + quantize_denoised=False, + img_callback=None, + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + batch_size=None, + x_T=None, + start_T=None, + log_every_t=None): + if not log_every_t: + log_every_t = self.log_every_t + timesteps = self.num_timesteps + if batch_size is not None: + b = batch_size if batch_size is not None else shape[0] + shape = [batch_size] + list(shape) + else: + b = batch_size = shape[0] + if x_T is None: + img = torch.randn(shape, device=self.device) + else: + img = x_T + intermediates = [] + if cond is not None: + if isinstance(cond, dict): + cond = { + key: + cond[key][:batch_size] if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = [c[:batch_size] for c in cond] if isinstance( + cond, list) else cond[:batch_size] + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = tqdm( + reversed(range(0, timesteps)), + desc='Progressive Generation', + total=timesteps) if verbose else reversed(range(0, timesteps)) + if type(temperature) == float: + temperature = [temperature] * timesteps + + for i in iterator: + ts = torch.full((b, ), i, device=self.device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != 'hybrid' + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample( + x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img, x0_partial = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + 
quantize_denoised=quantize_denoised, + return_x0=True, + temperature=temperature[i], + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs) + if mask is not None: + assert x0 is not None + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1. - mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(x0_partial) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + return img, intermediates + + @torch.no_grad() + def p_sample_loop(self, + cond, + shape, + return_intermediates=False, + x_T=None, + verbose=True, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + start_T=None, + log_every_t=None): + + if not log_every_t: + log_every_t = self.log_every_t + device = self.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + intermediates = [img] + if timesteps is None: + timesteps = self.num_timesteps + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = tqdm( + reversed(range(0, timesteps)), desc='Sampling t', + total=timesteps) if verbose else reversed(range(0, timesteps)) + + if mask is not None: + assert x0 is not None + assert x0.shape[2:3] == mask.shape[2: + 3] # spatial size has to match + + for i in iterator: + ts = torch.full((b, ), i, device=device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != 'hybrid' + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample( + x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + quantize_denoised=quantize_denoised) + if mask is not None: + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1. 
- mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(img) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, + cond, + batch_size=16, + return_intermediates=False, + x_T=None, + verbose=True, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + shape=None, + **kwargs): + if shape is None: + shape = (batch_size, self.channels, self.image_size, + self.image_size) + if cond is not None: + if isinstance(cond, dict): + cond = { + key: + cond[key][:batch_size] if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = [c[:batch_size] for c in cond] if isinstance( + cond, list) else cond[:batch_size] + return self.p_sample_loop( + cond, + shape, + return_intermediates=return_intermediates, + x_T=x_T, + verbose=verbose, + timesteps=timesteps, + quantize_denoised=quantize_denoised, + mask=mask, + x0=x0) + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + if ddim: + ddim_sampler = DDIMSampler(self) + shape = (self.channels, self.image_size, self.image_size) + samples, intermediates = ddim_sampler.sample( + ddim_steps, batch_size, shape, cond, verbose=False, **kwargs) + + else: + samples, intermediates = self.sample( + cond=cond, + batch_size=batch_size, + return_intermediates=True, + **kwargs) + + return samples, intermediates + + @torch.no_grad() + def get_unconditional_conditioning(self, batch_size, null_label=None): + if null_label is not None: + xc = null_label + if isinstance(xc, ListConfig): + xc = list(xc) + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + if hasattr(xc, 'to'): + xc = xc.to(self.device) + c = self.get_learned_conditioning(xc) + else: + if self.cond_stage_key in ['class_label', 'cls']: + xc = self.cond_stage_model.get_unconditional_conditioning( + batch_size, device=self.device) + return self.get_learned_conditioning(xc) + else: + raise NotImplementedError('todo') + if isinstance(c, list): # in case the encoder gives us a list + for i in range(len(c)): + c[i] = repeat( + c[i], '1 ... -> b ...', b=batch_size).to(self.device) + else: + c = repeat(c, '1 ... 
-> b ...', b=batch_size).to(self.device) + return c + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=50, + ddim_eta=0., + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=N) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + try: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + except KeyError: + # probably no "human_label" in batch + pass + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if quantize_denoised and not isinstance( + self.first_stage_model, AutoencoderKL) and not isinstance( + self.first_stage_model, IdentityFirstStage): + # also display when quantizing x0 while sampling + with ema_scope('Plotting Quantized Denoised'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + quantize_denoised=True) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True, + # quantize_denoised=True) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_x0_quantized'] = x_samples + + if unconditional_guidance_scale > 1.0: + uc = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + if 
self.model.conditioning_key == 'crossattn-adm': + uc = {'c_crossattn': [uc], 'c_adm': c['c_adm']} + with ema_scope('Sampling with classifier-free guidance'): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + if inpaint: + # make a simple center square + _, h, w = z.shape[0], z.shape[2], z.shape[3] + mask = torch.ones(N, h, w).to(self.device) + # zeros will be filled in + mask[:, h // 4:3 * h // 4, w // 4:3 * w // 4] = 0. + mask = mask[:, None, ...] + with ema_scope('Plotting Inpaint'): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_inpainting'] = x_samples + log['mask'] = mask + + # outpaint + mask = 1. - mask + with ema_scope('Plotting Outpaint'): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask) + x_samples = self.decode_first_stage(samples.to(self.device)) + log['samples_outpainting'] = x_samples + + if plot_progressive_rows: + with ema_scope('Plotting Progressives'): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N) + prog_row = self._get_denoise_row_from_list( + progressives, desc='Progressive Generation') + log['progressive_row'] = prog_row + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.cond_stage_trainable: + print( + f'{self.__class__.__name__}: Also optimizing conditioner params!' + ) + params = params + list(self.cond_stage_model.parameters()) + if self.learn_logvar: + print('Diffusion model optimizing logvar') + params.append(self.logvar) + opt = torch.optim.AdamW(params, lr=lr) + if self.use_scheduler: + assert 'target' in self.scheduler_config + scheduler = instantiate_from_config(self.scheduler_config) + + print('Setting up LambdaLR scheduler...') + scheduler = [{ + 'scheduler': + LambdaLR(opt, lr_lambda=scheduler.schedule), + 'interval': + 'step', + 'frequency': + 1 + }] + return [opt], scheduler + return opt + + @torch.no_grad() + def to_rgb(self, x): + x = x.float() + if not hasattr(self, 'colorize'): + self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x) + x = nn.functional.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. 
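+ # The random, fixed 1x1 convolution above projects an arbitrary number of
+ # channels down to 3 so map-style conditionings can be logged as RGB images;
+ # the result is then rescaled to [-1, 1] to match the range of the other logs.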
+ return x + + +class DiffusionWrapper(pl.LightningModule): + + def __init__(self, diff_model_config, conditioning_key): + super().__init__() + self.sequential_cross_attn = diff_model_config.pop( + 'sequential_crossattn', False) + self.diffusion_model = instantiate_from_config(diff_model_config) + self.conditioning_key = conditioning_key + assert self.conditioning_key in [ + None, 'concat', 'crossattn', 'hybrid', 'adm', 'hybrid-adm', + 'crossattn-adm' + ] + + def forward(self, + x, + t, + c_concat: list = None, + c_crossattn: list = None, + c_adm=None): + if self.conditioning_key is None: + out = self.diffusion_model(x, t) + elif self.conditioning_key == 'concat': + xc = torch.cat([x] + c_concat, dim=1) + out = self.diffusion_model(xc, t) + elif self.conditioning_key == 'crossattn': + if not self.sequential_cross_attn: + cc = torch.cat(c_crossattn, 1) + else: + cc = c_crossattn + out = self.diffusion_model(x, t, context=cc) + elif self.conditioning_key == 'hybrid': + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc) + elif self.conditioning_key == 'hybrid-adm': + assert c_adm is not None + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc, y=c_adm) + elif self.conditioning_key == 'crossattn-adm': + assert c_adm is not None + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(x, t, context=cc, y=c_adm) + elif self.conditioning_key == 'adm': + cc = c_crossattn[0] + out = self.diffusion_model(x, t, y=cc) + else: + raise NotImplementedError() + + return out + + +class LatentUpscaleDiffusion(LatentDiffusion): + + def __init__(self, + *args, + low_scale_config, + low_scale_key='LR', + noise_level_key=None, + **kwargs): + super().__init__(*args, **kwargs) + # assumes that neither the cond_stage nor the low_scale_model contain trainable params + assert not self.cond_stage_trainable + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + self.noise_level_key = noise_level_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False): + if not log_mode: + z, c = super().get_input(batch, k, force_c_encode=True, bs=bs) + else: + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + x_low = batch[self.low_scale_key][:bs] + x_low = rearrange(x_low, 'b h w c -> b c h w') + x_low = x_low.to(memory_format=torch.contiguous_format).float() + zx, noise_level = self.low_scale_model(x_low) + if self.noise_level_key is not None: + # get noise level from batch instead, e.g. 
when extracting a custom noise level for bsr + raise NotImplementedError('TODO') + + all_conds = { + 'c_concat': [zx], + 'c_crossattn': [c], + 'c_adm': noise_level + } + if log_mode: + # TODO: maybe disable if too expensive + x_low_rec = self.low_scale_model.decode(zx) + return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level + return z, all_conds + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1., + return_keys=None, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input( + batch, self.first_stage_key, bs=N, log_mode=True) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + log['x_lr'] = x_low + log[f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}"] = x_low_rec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_tmp = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + # TODO explore better "unconditional" choices for the other keys + # maybe guide away from empty text label and highest noise level and maximally degraded zx? + uc = dict() + for k in c: + if k == 'c_crossattn': + assert isinstance(c[k], list) and len(c[k]) == 1 + uc[k] = [uc_tmp] + elif k == 'c_adm': # todo: only run with text-based guidance? 
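+ # For the noise-level conditioning the "unconditional" batch simply reuses the
+ # conditional value, so guidance only acts on the cross-attention part; the
+ # commented-out line below would instead push it to the maximum noise level.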
+ assert isinstance(c[k], torch.Tensor) + # uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level + uc[k] = c[k] + elif isinstance(c[k], list): + uc[k] = [c[k][i] for i in range(len(c[k]))] + else: + uc[k] = c[k] + + with ema_scope('Sampling with classifier-free guidance'): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + if plot_progressive_rows: + with ema_scope('Plotting Progressives'): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N) + prog_row = self._get_denoise_row_from_list( + progressives, desc='Progressive Generation') + log['progressive_row'] = prog_row + + return log + + +class LatentFinetuneDiffusion(LatentDiffusion): + """ + Basis for different finetunas, such as inpainting or depth2image + To disable finetuning mode, set finetune_keys to None + """ + + def __init__( + self, + concat_keys: tuple, + finetune_keys=('model.diffusion_model.input_blocks.0.0.weight', + 'model_ema.diffusion_modelinput_blocks00weight'), + keep_finetune_dims=4, + # if model was trained without concat mode before and we would like to keep these channels + c_concat_log_start=None, # to log reconstruction of c_concat codes + c_concat_log_end=None, + *args, + **kwargs): + ckpt_path = kwargs.pop('ckpt_path', None) + ignore_keys = kwargs.pop('ignore_keys', list()) + super().__init__(*args, **kwargs) + self.finetune_keys = finetune_keys + self.concat_keys = concat_keys + self.keep_dims = keep_finetune_dims + self.c_concat_log_start = c_concat_log_start + self.c_concat_log_end = c_concat_log_end + if exists(self.finetune_keys): + assert exists( + ckpt_path), 'can only finetune from a given checkpoint' + if exists(ckpt_path): + self.init_from_ckpt(ckpt_path, ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + sd = torch.load(path, map_location='cpu') + if 'state_dict' in list(sd.keys()): + sd = sd['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + + # make it explicit, finetune by including extra input channels + if exists(self.finetune_keys) and k in self.finetune_keys: + new_entry = None + for name, param in self.named_parameters(): + if name in self.finetune_keys: + print( + f"modifying key '{name}' and keeping its " + f'original {self.keep_dims} (channels) dimensions only' + ) + new_entry = torch.zeros_like(param) # zero init + assert exists( + new_entry), 'did not find matching parameter to modify' + new_entry[:, :self.keep_dims, ...] 
= sd[k] + sd[k] = new_entry + + missing, unexpected = self.load_state_dict( + sd, + strict=False) if not only_model else self.model.load_state_dict( + sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys: {missing}') + if len(unexpected) > 0: + print(f'Unexpected Keys: {unexpected}') + + @torch.no_grad() + def log_images(self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1., + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1., + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, self.first_stage_key, bs=N, return_first_stage_outputs=True) + c_cat, c = c['c_concat'][0], c['c_crossattn'][0] + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log['inputs'] = x + log['reconstruction'] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, 'decode'): + xc = self.cond_stage_model.decode(c) + log['conditioning'] = xc + elif self.cond_stage_key in ['caption', 'txt']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif self.cond_stage_key in ['class_label', 'cls']: + xc = log_txt_as_img((x.shape[2], x.shape[3]), + batch['human_label'], + size=x.shape[2] // 25) + log['conditioning'] = xc + elif isimage(xc): + log['conditioning'] = xc + if ismap(xc): + log['original_conditioning'] = self.to_rgb(xc) + + if not (self.c_concat_log_start is None + and self.c_concat_log_end is None): + log['c_concat_decoded'] = self.decode_first_stage( + c_cat[:, self.c_concat_log_start:self.c_concat_log_end]) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack( + diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, + 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid( + diffusion_grid, nrow=diffusion_row.shape[0]) + log['diffusion_row'] = diffusion_grid + + if sample: + # get denoise row + with ema_scope('Sampling'): + samples, z_denoise_row = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log['samples'] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log['denoise_row'] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning( + N, unconditional_guidance_label) + uc_cat = c_cat + uc_full = {'c_concat': [uc_cat], 'c_crossattn': [uc_cross]} + with ema_scope('Sampling with classifier-free guidance'): 
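+ # Classifier-free guidance: uc_full pairs the unconditional cross-attention
+ # embedding (uc_cross) with the same concat channels (uc_cat = c_cat), and the
+ # sampler blends the two predictions roughly as
+ # eps = eps_uncond + scale * (eps_cond - eps_uncond),
+ # so unconditional_guidance_scale > 1 strengthens the conditioning.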
+ samples_cfg, _ = self.sample_log( + cond={ + 'c_concat': [c_cat], + 'c_crossattn': [c] + }, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[f'samples_cfg_scale_{unconditional_guidance_scale:.2f}'] = x_samples_cfg + + return log + + +class LatentInpaintDiffusion(LatentFinetuneDiffusion): + """ + can either run as pure inpainting model (only concat mode) or with mixed conditionings, + e.g. mask as concat and text via cross-attn. + To disable finetuning mode, set finetune_keys to None + """ + + def __init__(self, + concat_keys=('mask', 'masked_image'), + masked_image_key='masked_image', + *args, + **kwargs): + super().__init__(concat_keys, *args, **kwargs) + self.masked_image_key = masked_image_key + assert self.masked_image_key in concat_keys + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for inpainting' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + c_cat = list() + for ck in self.concat_keys: + cc = rearrange(batch[ck], 'b h w c -> b c h w').to( + memory_format=torch.contiguous_format).float() + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + bchw = z.shape + if ck != self.masked_image_key: + cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) + else: + cc = self.get_first_stage_encoding(self.encode_first_stage(cc)) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs) + log['masked_image'] = rearrange( + args[0]['masked_image'], 'b h w c -> b c h w').to( + memory_format=torch.contiguous_format).float() + return log + + +class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): + """ + condition on monocular depth estimation + """ + + def __init__(self, + depth_stage_config, + concat_keys=('midas_in', ), + *args, + **kwargs): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.depth_model = instantiate_from_config(depth_stage_config) + self.depth_stage_key = concat_keys[0] + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for depth2img' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + c_cat = list() + for ck in self.concat_keys: + cc = batch[ck] + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + cc = self.depth_model(cc) + cc = torch.nn.functional.interpolate( + cc, + size=z.shape[2:], + mode='bicubic', + align_corners=False, + ) + + depth_min, depth_max = torch.amin( + cc, dim=[1, 2, 3], keepdim=True), torch.amax( 
+ cc, dim=[1, 2, 3], keepdim=True) + cc = 2. * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1. + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + depth = self.depth_model(args[0][self.depth_stage_key]) + depth_min, depth_max = torch.amin(depth, dim=[1, 2, 3], keepdim=True), \ + torch.amax(depth, dim=[1, 2, 3], keepdim=True) + log['depth'] = 2. * (depth - depth_min) / (depth_max - depth_min) - 1. + return log + + +class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): + """ + condition on low-res image (and optionally on some spatial noise augmentation) + """ + + def __init__(self, + concat_keys=('lr', ), + reshuffle_patch_size=None, + low_scale_config=None, + low_scale_key=None, + *args, + **kwargs): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.reshuffle_patch_size = reshuffle_patch_size + self.low_scale_model = None + if low_scale_config is not None: + print('Initializing a low-scale model') + assert exists(low_scale_key) + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input(self, + batch, + k, + cond_key=None, + bs=None, + return_first_stage_outputs=False): + # note: restricted to non-trainable encoders currently + assert not self.cond_stage_trainable, 'trainable cond stages not yet supported for upscaling-ft' + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + # optionally make spatial noise_level here + c_cat = list() + noise_level = None + for ck in self.concat_keys: + cc = batch[ck] + cc = rearrange(cc, 'b h w c -> b c h w') + if exists(self.reshuffle_patch_size): + assert isinstance(self.reshuffle_patch_size, int) + cc = rearrange( + cc, + 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w', + p1=self.reshuffle_patch_size, + p2=self.reshuffle_patch_size) + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + if exists(self.low_scale_model) and ck == self.low_scale_key: + cc, noise_level = self.low_scale_model(cc) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + if exists(noise_level): + all_conds = { + 'c_concat': [c_cat], + 'c_crossattn': [c], + 'c_adm': noise_level + } + else: + all_conds = {'c_concat': [c_cat], 'c_crossattn': [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + log['lr'] = rearrange(args[0]['lr'], 'b h w c -> b c h w') + return log diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py new file mode 100644 index 000000000..f92d5feb0 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/plms.py @@ -0,0 +1,328 @@ +"""SAMPLING ONLY.""" + +from functools import partial + +import numpy as np +import torch +from tqdm import tqdm + +from 
....ldm.models.diffusion.sampling_util import norm_thresholding +from ....ldm.modules.diffusionmodules.util import ( + make_ddim_sampling_parameters, make_ddim_timesteps, noise_like) + + +class PLMSSampler(object): + + def __init__(self, model, schedule='linear', **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device('cuda'): + attr = attr.to(torch.device('cuda')) + setattr(self, name, attr) + + def make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + if ddim_eta != 0: + raise ValueError('ddim_eta must be 0 for PLMS') + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + + def to_torch(x): + return x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', + np.sqrt(1. - ddim_alphas)) + tmp1 = (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) + tmp2 = (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(tmp1 * tmp2) + self.register_buffer('ddim_sigmas_for_original_num_steps', + sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
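+ # it is combined with `conditioning` for classifier-free guidance whenever
+ # unconditional_guidance_scale != 1 (see get_model_output in p_sample_plms)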
+ dynamic_threshold=None, + **kwargs): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print( + f'Warning: Got {cbs} conditionings but batch-size is {batch_size}' + ) + else: + if conditioning.shape[0] != batch_size: + print( + f'Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}' + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for PLMS sampling is {size}') + + samples, intermediates = self.plms_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ) + return samples, intermediates + + @torch.no_grad() + def plms_sampling(self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + dynamic_threshold=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = list(reversed(range( + 0, timesteps))) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[ + 0] + print(f'Running PLMS Sampling with {total_steps} timesteps') + + iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps) + old_eps = [] + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b, ), step, device=device, dtype=torch.long) + ts_next = torch.full((b, ), + time_range[min(i + 1, + len(time_range) - 1)], + device=device, + dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
- mask) * img + + outs = self.p_sample_plms( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + old_eps=old_eps, + t_next=ts_next, + dynamic_threshold=dynamic_threshold) + img, pred_x0, e_t = outs + old_eps.append(e_t) + if len(old_eps) >= 4: + old_eps.pop(0) + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_plms(self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + old_eps=None, + t_next=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + def get_model_output(x, t): + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + e_t = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + c_in = torch.cat([unconditional_conditioning, c]) + e_t_uncond, e_t = self.model.apply_model(x_in, t_in, + c_in).chunk(2) + e_t = e_t_uncond + unconditional_guidance_scale * ( + e_t - e_t_uncond) + + if score_corrector is not None: + assert self.model.parameterization == 'eps' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, + **corrector_kwargs) + + return e_t + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod \ + if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + + def get_x_prev_and_pred_x0(e_t, index): + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), + alphas_prev[index], + device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), + sqrt_one_minus_alphas[index], + device=device) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + if dynamic_threshold is not None: + pred_x0 = norm_thresholding(pred_x0, dynamic_threshold) + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, + repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + e_t = get_model_output(x, t) + if len(old_eps) == 0: + # Pseudo Improved Euler (2nd order) + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) + e_t_next = get_model_output(x_prev, t_next) + e_t_prime = (e_t + e_t_next) / 2 + elif len(old_eps) == 1: + # 2nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (3 * e_t - old_eps[-1]) / 2 + elif len(old_eps) == 2: + # 3nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 + elif len(old_eps) >= 3: + # 4nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] + - 9 * old_eps[-3]) / 24 + + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) + + return x_prev, pred_x0, e_t diff --git a/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py b/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py new file mode 100644 index 000000000..52cfabed8 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/models/diffusion/sampling_util.py @@ -0,0 +1,25 @@ +import numpy as np +import torch + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions. + From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError( + f'input has {x.ndim} dims but target_dims is {target_dims}, which is less' + ) + return x[(..., ) + (None, ) * dims_to_append] + + +def norm_thresholding(x0, value): + s = append_dims( + x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) + return x0 * (value / s) + + +def spatial_norm_thresholding(x0, value): + # b c h w + s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) + return x0 * (value / s) diff --git a/modelscope/models/cv/anydoor/ldm/modules/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/attention.py b/modelscope/models/cv/anydoor/ldm/modules/attention.py new file mode 100644 index 000000000..37921b866 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/attention.py @@ -0,0 +1,367 @@ +import math +# CrossAttn precision handling +import os +from inspect import isfunction +from typing import Any, Optional + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import einsum, nn + +from ...ldm.modules.diffusionmodules.util import checkpoint + +try: + import xformers + import xformers.ops + XFORMERS_IS_AVAILABLE = True +except Exception: + XFORMERS_IS_AVAILABLE = False + +_ATTN_PRECISION = os.environ.get('ATTN_PRECISION', 'fp32') + + +def exists(val): + return val is not None + + +def uniq(arr): + return {el: True for el in arr}.keys() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + + +# feedforward +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out 
* 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + +class SpatialSelfAttention(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = rearrange(q, 'b c h w -> b (h w) c') + k = rearrange(k, 'b c h w -> b c (h w)') + w_ = torch.einsum('bij,bjk->bik', q, k) + + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, 'b c h w -> b c (h w)') + w_ = rearrange(w_, 'b i j -> b j i') + h_ = torch.einsum('bij,bjk->bik', v, w_) + h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) + h_ = self.proj_out(h_) + + return x + h_ + + +class CrossAttention(nn.Module): + + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + + def forward(self, x, context=None, mask=None): + h = self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), + (q, k, v)) + + # force cast to fp32 to avoid overflowing + if _ATTN_PRECISION == 'fp32': + with torch.autocast(enabled=False, device_type='cuda'): + q, k = q.float(), k.float() + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + else: + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + del q, k + + if exists(mask): + mask = rearrange(mask, 'b ... 
-> b (...)') + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, 'b j -> (b h) () j', h=h) + sim.masked_fill_(~mask, max_neg_value) + + # attention, what we cannot get enough of + sim = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', sim, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + return self.to_out(out) + + +class MemoryEfficientCrossAttention(nn.Module): + # https://github.com/MatthieuTPHR/diffusers/blob/d80b531ff8060ec1ea982b65a1b8df70f73aa67c/src/diffusers/models/attention.py#L223 + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.0): + super().__init__() + print( + f'Setting up {self.__class__.__name__}. Query dim is {query_dim}, context_dim is {context_dim} and using ' + f'{heads} heads.') + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.heads = heads + self.dim_head = dim_head + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + self.attention_op: Optional[Any] = None + + def forward(self, x, context=None, mask=None): + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + b, _, _ = q.shape + q, k, v = map( + lambda t: t.unsqueeze(3).reshape(b, t.shape[ + 1], self.heads, self.dim_head).permute(0, 2, 1, 3).reshape( + b * self.heads, t.shape[1], self.dim_head).contiguous(), + (q, k, v), + ) + + # actually compute the attention, what we cannot get enough of + out = xformers.ops.memory_efficient_attention( + q, k, v, attn_bias=None, op=self.attention_op) + + if exists(mask): + raise NotImplementedError + out = ( + out.unsqueeze(0).reshape( + b, self.heads, out.shape[1], + self.dim_head).permute(0, 2, 1, + 3).reshape(b, out.shape[1], + self.heads * self.dim_head)) + return self.to_out(out) + + +class BasicTransformerBlock(nn.Module): + ATTENTION_MODES = { + 'softmax': CrossAttention, # vanilla attention + 'softmax-xformers': MemoryEfficientCrossAttention + } + + def __init__(self, + dim, + n_heads, + d_head, + dropout=0., + context_dim=None, + gated_ff=True, + checkpoint=True, + disable_self_attn=False): + super().__init__() + attn_mode = 'softmax-xformers' if XFORMERS_IS_AVAILABLE else 'softmax' + assert attn_mode in self.ATTENTION_MODES + attn_cls = self.ATTENTION_MODES[attn_mode] + self.disable_self_attn = disable_self_attn + self.attn1 = attn_cls( + query_dim=dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + context_dim=context_dim if self.disable_self_attn else + None) # is a self-attention if not self.disable_self_attn + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = attn_cls( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context=None): + x = self.attn1( + self.norm1(x), + context=context if self.disable_self_attn else None) + x + x = self.attn2(self.norm2(x), context=context) + x + x = self.ff(self.norm3(x)) + x + return x + + +class SpatialTransformer(nn.Module): + """ + Transformer block for 
image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + NEW: use_linear for more efficiency instead of the 1x1 convs + """ + + def __init__(self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0., + context_dim=None, + disable_self_attn=False, + use_linear=False, + use_checkpoint=True): + super().__init__() + if exists(context_dim) and not isinstance(context_dim, list): + context_dim = [context_dim] + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + if not use_linear: + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + else: + self.proj_in = nn.Linear(in_channels, inner_dim) + + self.transformer_blocks = nn.ModuleList([ + BasicTransformerBlock( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim[d], + disable_self_attn=disable_self_attn, + checkpoint=use_checkpoint) for d in range(depth) + ]) + if not use_linear: + self.proj_out = zero_module( + nn.Conv2d( + inner_dim, in_channels, kernel_size=1, stride=1, + padding=0)) + else: + self.proj_out = zero_module(nn.Linear(in_channels, inner_dim)) + self.use_linear = use_linear + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + if not isinstance(context, list): + context = [context] + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + if not self.use_linear: + x = self.proj_in(x) + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + if self.use_linear: + x = self.proj_in(x) + for i, block in enumerate(self.transformer_blocks): + x = block(x, context=context[i]) + if self.use_linear: + x = self.proj_out(x) + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + if not self.use_linear: + x = self.proj_out(x) + return x + x_in diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py new file mode 100644 index 000000000..77b2f3826 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/model.py @@ -0,0 +1,966 @@ +# pytorch_diffusion + derived encoder decoder +import math +from typing import Any, Optional + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange + +from ....ldm.modules.attention import MemoryEfficientCrossAttention + +try: + import xformers + import xformers.ops + XFORMERS_IS_AVAILABLE = True +except Exception: + XFORMERS_IS_AVAILABLE = False + print("No module 'xformers'. Proceeding without it.") + + +def get_timestep_embedding(timesteps, embedding_dim): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: + From Fairseq. + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". 
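    Illustrative usage (an editorial sketch, not part of the original source;
    the timestep values are arbitrary): each integer timestep maps to a fixed
    `embedding_dim`-sized vector of concatenated sines and cosines, e.g.

        import torch
        t = torch.tensor([0, 10, 100])             # arbitrary example timesteps
        emb = get_timestep_embedding(t, embedding_dim=64)
        assert emb.shape == (3, 64)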
+ """ + assert len(timesteps.shape) == 1 + + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) + emb = emb.to(device=timesteps.device) + emb = timesteps.float()[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm( + num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + + +class Upsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, scale_factor=2.0, mode='nearest') + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode='constant', value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + + def __init__(self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class AttnBlock(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out 
= torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, c, h * w) + q = q.permute(0, 2, 1) # b,hw,c + k = k.reshape(b, c, h * w) # b,c,hw + w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b, c, h * w) + w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm( + v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b, c, h, w) + + h_ = self.proj_out(h_) + + return x + h_ + + +class MemoryEfficientAttnBlock(nn.Module): + """ + Uses xformers efficient implementation, + Note: this is a single-head self-attention operation + """ + + # + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.attention_op: Optional[Any] = None + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + B, C, H, W = q.shape + q, k, v = map(lambda x: rearrange(x, 'b c h w -> b (h w) c'), + (q, k, v)) + + q, k, v = map( + lambda t: t.unsqueeze(3).reshape(B, t.shape[1], 1, C).permute( + 0, 2, 1, 3).reshape(B * 1, t.shape[1], C).contiguous(), + (q, k, v), + ) + out = xformers.ops.memory_efficient_attention( + q, k, v, attn_bias=None, op=self.attention_op) + + out = ( + out.unsqueeze(0).reshape(B, 1, out.shape[1], + C).permute(0, 2, 1, + 3).reshape(B, out.shape[1], C)) + out = rearrange(out, 'b (h w) c -> b c h w', b=B, h=H, w=W, c=C) + out = self.proj_out(out) + return x + out + + +class MemoryEfficientCrossAttentionWrapper(MemoryEfficientCrossAttention): + + def forward(self, x, context=None, mask=None): + b, c, h, w = x.shape + x = rearrange(x, 'b c h w -> b (h w) c') + out = super().forward(x, context=context, mask=mask) + out = rearrange(out, 'b (h w) c -> b c h w', h=h, w=w, c=c) + return x + out + + +def make_attn(in_channels, attn_type='vanilla', attn_kwargs=None): + assert attn_type in [ + 'vanilla', 'vanilla-xformers', 'memory-efficient-cross-attn', 'linear', + 'none' + ], f'attn_type {attn_type} unknown' + if XFORMERS_IS_AVAILABLE and attn_type == 'vanilla': + attn_type = 'vanilla-xformers' + print( + f"making attention of type '{attn_type}' with {in_channels} in_channels" + ) + if attn_type == 'vanilla': + assert attn_kwargs is None + return AttnBlock(in_channels) + elif attn_type == 'vanilla-xformers': + print( + f'building MemoryEfficientAttnBlock with {in_channels} in_channels...' 
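# Editorial aside (not part of the original diff): AttnBlock.forward above is
# plain single-head self-attention over the h*w positions, i.e.
# softmax(q k^T / sqrt(c)) applied to v. A self-contained sketch of that
# computation and its shapes (the tensors below are arbitrary stand-ins for
# the 1x1-conv outputs):
import torch

b, c, h, w = 1, 8, 4, 4
q = torch.randn(b, c, h * w)
k = torch.randn(b, c, h * w)
v = torch.randn(b, c, h * w)

attn = torch.softmax(torch.bmm(q.transpose(1, 2), k) * c**-0.5, dim=2)  # (b, hw, hw)
out = torch.bmm(v, attn.transpose(1, 2))                                # (b, c, hw)
assert out.shape == (b, c, h * w)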
+ ) + return MemoryEfficientAttnBlock(in_channels) + elif type == 'memory-efficient-cross-attn': + attn_kwargs['query_dim'] = in_channels + return MemoryEfficientCrossAttentionWrapper(**attn_kwargs) + elif attn_type == 'none': + return nn.Identity(in_channels) + else: + raise NotImplementedError() + + +class Model(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + use_timestep=True, + use_linear_attn=False, + attn_type='vanilla'): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = self.ch * 4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList([ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ]) + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + skip_in = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + if i_block == self.num_res_blocks: + skip_in = ch * in_ch_mult[i_level] + block.append( + ResnetBlock( + in_channels=block_in + skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, x, t=None, context=None): + # assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = 
get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], + dim=1), temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + def get_last_layer(self): + return self.conv_out.weight + + +class Encoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + double_z=True, + use_linear_attn=False, + attn_type='vanilla', + **ignore_kwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + 
hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type='vanilla', + **ignorekwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2**(self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print('Working with z of shape {} = {} dimensions.'.format( + self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class SimpleDecoder(nn.Module): + + def __init__(self, in_channels, out_channels, *args, **kwargs): + super().__init__() + self.model = nn.ModuleList([ + nn.Conv2d(in_channels, in_channels, 1), + ResnetBlock( + in_channels=in_channels, + out_channels=2 * in_channels, + 
temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=2 * in_channels, + out_channels=4 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=4 * in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + nn.Conv2d(2 * in_channels, in_channels, 1), + Upsample(in_channels, with_conv=True) + ]) + # end + self.norm_out = Normalize(in_channels) + self.conv_out = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + for i, layer in enumerate(self.model): + if i in [1, 2, 3]: + x = layer(x, None) + else: + x = layer(x) + + h = self.norm_out(x) + h = nonlinearity(h) + x = self.conv_out(h) + return x + + +class UpsampleDecoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ch, + num_res_blocks, + resolution, + ch_mult=(2, 2), + dropout=0.0): + super().__init__() + # upsampling + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = in_channels + curr_res = resolution // 2**(self.num_resolutions - 1) + self.res_blocks = nn.ModuleList() + self.upsample_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + res_block = [] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + res_block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + self.res_blocks.append(nn.ModuleList(res_block)) + if i_level != self.num_resolutions - 1: + self.upsample_blocks.append(Upsample(block_in, True)) + curr_res = curr_res * 2 + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # upsampling + h = x + for k, i_level in enumerate(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.res_blocks[i_level][i_block](h, None) + if i_level != self.num_resolutions - 1: + h = self.upsample_blocks[k](h) + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class LatentRescaler(nn.Module): + + def __init__(self, + factor, + in_channels, + mid_channels, + out_channels, + depth=2): + super().__init__() + # residual block, interpolate, residual block + self.factor = factor + self.conv_in = nn.Conv2d( + in_channels, mid_channels, kernel_size=3, stride=1, padding=1) + self.res_block1 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + self.attn = AttnBlock(mid_channels) + self.res_block2 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) + + def forward(self, x): + x = self.conv_in(x) + for block in self.res_block1: + x = block(x, None) + x = torch.nn.functional.interpolate( + x, + size=(int(round(x.shape[2] * self.factor)), + int(round(x.shape[3] * self.factor)))) + x = self.attn(x) + for block in self.res_block2: + x = block(x, None) + x = self.conv_out(x) + return x + + +class MergedRescaleEncoder(nn.Module): + + def __init__(self, + in_channels, + ch, + resolution, + out_ch, + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + ch_mult=(1, 2, 4, 8), + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + 
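# Editorial aside (not part of the original diff): the LatentRescaler above
# changes spatial size purely via F.interpolate with a rounded target size,
# so a factor of 0.5 maps a 32x32 latent to 16x16 while conv_in/conv_out set
# the channel counts. A self-contained sketch of that size arithmetic (the
# tensor values and factor are arbitrary):
import torch
import torch.nn.functional as F

x = torch.randn(1, 4, 32, 32)   # toy latent
factor = 0.5
y = F.interpolate(
    x, size=(int(round(x.shape[2] * factor)), int(round(x.shape[3] * factor))))
assert y.shape == (1, 4, 16, 16)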
intermediate_chn = ch * ch_mult[-1] + self.encoder = Encoder( + in_channels=in_channels, + num_res_blocks=num_res_blocks, + ch=ch, + ch_mult=ch_mult, + z_channels=intermediate_chn, + double_z=False, + resolution=resolution, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + out_ch=None) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=intermediate_chn, + mid_channels=intermediate_chn, + out_channels=out_ch, + depth=rescale_module_depth) + + def forward(self, x): + x = self.encoder(x) + x = self.rescaler(x) + return x + + +class MergedRescaleDecoder(nn.Module): + + def __init__(self, + z_channels, + out_ch, + resolution, + num_res_blocks, + attn_resolutions, + ch, + ch_mult=(1, 2, 4, 8), + dropout=0.0, + resamp_with_conv=True, + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + tmp_chn = z_channels * ch_mult[-1] + self.decoder = Decoder( + out_ch=out_ch, + z_channels=tmp_chn, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + in_channels=None, + num_res_blocks=num_res_blocks, + ch_mult=ch_mult, + resolution=resolution, + ch=ch) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=z_channels, + mid_channels=tmp_chn, + out_channels=tmp_chn, + depth=rescale_module_depth) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Upsampler(nn.Module): + + def __init__(self, + in_size, + out_size, + in_channels, + out_channels, + ch_mult=2): + super().__init__() + assert out_size >= in_size + num_blocks = int(np.log2(out_size // in_size)) + 1 + factor_up = 1. + (out_size % in_size) + print( + f'Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}' + ) + self.rescaler = LatentRescaler( + factor=factor_up, + in_channels=in_channels, + mid_channels=2 * in_channels, + out_channels=in_channels) + self.decoder = Decoder( + out_ch=out_channels, + resolution=out_size, + z_channels=in_channels, + num_res_blocks=2, + attn_resolutions=[], + in_channels=None, + ch=in_channels, + ch_mult=[ch_mult for _ in range(num_blocks)]) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Resize(nn.Module): + + def __init__(self, in_channels=None, learned=False, mode='bilinear'): + super().__init__() + self.with_conv = learned + self.mode = mode + if self.with_conv: + print( + f'Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode' + ) + raise NotImplementedError() + assert in_channels is not None + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x, scale_factor=1.0): + if scale_factor == 1.0: + return x + else: + x = torch.nn.functional.interpolate( + x, + mode=self.mode, + align_corners=False, + scale_factor=scale_factor) + return x diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py new file mode 100644 index 000000000..afe1b864b --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/openaimodel.py @@ -0,0 +1,820 @@ +import math +from abc import abstractmethod + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from ....ldm.modules.attention import SpatialTransformer +from 
....ldm.modules.diffusionmodules.util import (avg_pool_nd, checkpoint, + conv_nd, linear, + normalization, + timestep_embedding, + zero_module) +from ....ldm.util import exists + + +# dummy replace +def convert_module_to_f16(x): + pass + + +def convert_module_to_f32(x): + pass + + +# go +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + x = layer(x, context) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd( + dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), + mode='nearest') + else: + x = F.interpolate(x, scale_factor=2, mode='nearest') + if self.use_conv: + x = self.conv(x) + return x + + +class TransposedUpsample(nn.Module): + 'Learned 2x upsampling without padding' + + def __init__(self, channels, out_channels=None, ks=5): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + + self.up = nn.ConvTranspose2d( + self.channels, self.out_channels, kernel_size=ks, stride=2) + + def forward(self, x): + return self.up(x) + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. 
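    Illustrative shape check (an editorial sketch, not part of the original
    source; the sizes are arbitrary): with the 2D defaults the spatial
    resolution is halved, e.g.

        import torch
        down = Downsample(channels=64, use_conv=True)   # relies on this module's own imports
        x = torch.randn(1, 64, 32, 32)
        assert down(x).shape == (1, 64, 16, 16)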
+ """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=padding) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels + if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd( + dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, + 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. 
+ """ + return checkpoint(self._forward, (x, emb), self.parameters(), + self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. + """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}' + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint( + self._forward, (x, ), self.parameters(), True + ) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! + # return pt_checkpoint(self._forward, x) # pytorch + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split( + ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', q * scale, + k * scale) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, + v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. 
+ """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None + + if context_dim is not None: + assert use_spatial_transformer + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. 
' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.') + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + if isinstance(self.num_classes, int): + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + elif self.num_classes == 'continuous': + print('setting up linear c_adm embedding layer') + self.label_emb = nn.Linear(1, time_embed_dim) + else: + raise ValueError() + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, 
+ use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(self.num_res_blocks[level] + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or i < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa, + use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint)) + if level and i == self.num_res_blocks[level]: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) if resblock_updown else Upsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module( + conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. 
+ :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), 'must specify y if and only if the model is class-conditional' + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + if self.num_classes is not None: + assert y.shape[0] == x.shape[0] + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + h = h.type(x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py new file mode 100644 index 000000000..bcc9d138f --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/upscaling.py @@ -0,0 +1,103 @@ +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +from ....ldm.modules.diffusionmodules.util import (extract_into_tensor, + make_beta_schedule) +from ....ldm.util import default + + +class AbstractLowScaleModel(nn.Module): + # for concatenating a downsampled image to the latent representation + def __init__(self, noise_schedule_config=None): + super(AbstractLowScaleModel, self).__init__() + if noise_schedule_config is not None: + self.register_schedule(**noise_schedule_config) + + def register_schedule(self, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[ + 0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. 
/ alphas_cumprod - 1))) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def forward(self, x): + return x, None + + def decode(self, x): + return x + + +class SimpleImageConcat(AbstractLowScaleModel): + # no noise level conditioning + def __init__(self): + super(SimpleImageConcat, self).__init__(noise_schedule_config=None) + self.max_noise_level = 0 + + def forward(self, x): + # fix to constant noise level + return x, torch.zeros(x.shape[0], device=x.device).long() + + +class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel): + + def __init__(self, + noise_schedule_config, + max_noise_level=1000, + to_cuda=False): + super().__init__(noise_schedule_config=noise_schedule_config) + self.max_noise_level = max_noise_level + + def forward(self, x, noise_level=None): + if noise_level is None: + noise_level = torch.randint( + 0, self.max_noise_level, (x.shape[0], ), + device=x.device).long() + else: + assert isinstance(noise_level, torch.Tensor) + z = self.q_sample(x, noise_level) + return z, noise_level diff --git a/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py new file mode 100644 index 000000000..d48ea5f52 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/diffusionmodules/util.py @@ -0,0 +1,310 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! 
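Editor's note: the `q_sample` method of `AbstractLowScaleModel` above applies the closed-form forward-diffusion step x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps, using buffers derived from `make_beta_schedule` (defined in the `util.py` hunk that follows), and `ImageConcatWithNoiseAugmentation` simply draws a random integer noise level per sample before calling it. The sketch below reproduces that step standalone with a plain linear schedule; it is an illustration only, not the repository module, and the helper names (`linear_betas`, `q_sample`) are ad hoc.

```python
# Minimal standalone sketch (not the repository module): the closed-form
# forward-diffusion step that AbstractLowScaleModel.q_sample performs with
# its registered buffers, shown here with a plain linear beta schedule.
import torch


def linear_betas(n_steps=1000, start=1e-4, end=2e-2):
    # Same parameterisation as make_beta_schedule's 'linear' branch:
    # betas are squared interpolations between sqrt(start) and sqrt(end).
    return torch.linspace(start ** 0.5, end ** 0.5, n_steps) ** 2


def q_sample(x_start, t, betas, noise=None):
    """Sample x_t ~ q(x_t | x_0) = sqrt(a_bar_t) * x_0 + sqrt(1 - a_bar_t) * eps."""
    alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
    noise = torch.randn_like(x_start) if noise is None else noise
    # Broadcast the per-timestep scalars over the image dimensions,
    # mirroring what extract_into_tensor does with gather + reshape.
    shape = (x_start.shape[0],) + (1,) * (x_start.dim() - 1)
    sqrt_ab = alphas_cumprod[t].sqrt().reshape(shape)
    sqrt_one_minus_ab = (1.0 - alphas_cumprod[t]).sqrt().reshape(shape)
    return sqrt_ab * x_start + sqrt_one_minus_ab * noise


if __name__ == '__main__':
    x0 = torch.randn(8, 4, 32, 32)            # a batch of latents
    t = torch.randint(0, 1000, (8,))          # one noise level per sample
    xt = q_sample(x0, t, linear_betas())
    print(xt.shape)                           # torch.Size([8, 4, 32, 32])
```

Because alpha_bar_t shrinks monotonically, larger `t` values push the sample further toward pure noise, which is how the noise-level conditioning in `ImageConcatWithNoiseAugmentation` controls augmentation strength.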
+ +import math +import os + +import numpy as np +import torch +import torch.nn as nn +from einops import repeat + +from ....ldm.util import instantiate_from_config + + +def make_beta_schedule(schedule, + n_timestep, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + if schedule == 'linear': + betas = ( + torch.linspace( + linear_start**0.5, + linear_end**0.5, + n_timestep, + dtype=torch.float64)**2) + + elif schedule == 'cosine': + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + + cosine_s) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == 'sqrt_linear': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64) + elif schedule == 'sqrt': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64)**0.5 + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_timesteps(ddim_discr_method, + num_ddim_timesteps, + num_ddpm_timesteps, + verbose=True): + if ddim_discr_method == 'uniform': + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == 'quad': + ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), + num_ddim_timesteps))**2).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f'Selected timesteps for ddim sampler: {steps_out}') + return steps_out + + +def make_ddim_sampling_parameters(alphacums, + ddim_timesteps, + eta, + verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + tmp = (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev) + sigmas = eta * np.sqrt(tmp) + if verbose: + print( + f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}' + ) + print( + f'For the chosen value of eta, which is {eta}, ' + f'this results in the following sigma_t schedule for ddim sampler {sigmas}' + ) + return sigmas, alphas, alphas_prev + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. 
+ """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1, ) * (len(x_shape) - 1))) + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = { + 'enabled': torch.is_autocast_enabled(), + 'dtype': torch.get_autocast_gpu_dtype(), + 'cache_enabled': torch.is_autocast_cache_enabled() + } + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [ + x.detach().requires_grad_(True) for x in ctx.input_tensors + ] + with torch.enable_grad(), \ + torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. + shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + else: + embedding = repeat(timesteps, 'b -> b d', d=dim) + return embedding + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. 
+ """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + + def forward(self, x): + return x * torch.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config( + c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} + + +def noise_like(shape, device, repeat=False): + + def repeat_noise(): + torch.randn( + (1, *shape[1:]), device=device).repeat(shape[0], + *((1, ) * (len(shape) - 1))) + + noise = lambda: torch.randn(shape, device=device) # noqa + return repeat_noise() if repeat else noise() diff --git a/modelscope/models/cv/anydoor/ldm/modules/distributions/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py b/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py new file mode 100644 index 000000000..dd094d532 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/distributions/distributions.py @@ -0,0 +1,93 @@ +import numpy as np +import torch + + +class AbstractDistribution: + + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn( + self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if 
self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, 'at least one argument must be a Tensor' + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + tmp = ((mean1 - mean2)**2) * torch.exp(-logvar2) + return 0.5 * (-1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + + tmp) diff --git a/modelscope/models/cv/anydoor/ldm/modules/ema.py b/modelscope/models/cv/anydoor/ldm/modules/ema.py new file mode 100644 index 000000000..a1167fe70 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/ema.py @@ -0,0 +1,87 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError('Decay must be between 0 and 1') + + self.m_name2s_name = {} + self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) + self.register_buffer( + 'num_updates', + torch.tensor(0, dtype=torch.int) + if use_num_upates else torch.tensor(-1, dtype=torch.int)) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace('.', '') + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + tmp = (1 + self.num_updates) / (10 + self.num_updates) + decay = min(self.decay, tmp) + + one_minus_decay = 1.0 - decay + + with torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as( + m_param[key]) + tmp = shadow_params[sname] - m_param[key] + shadow_params[sname].sub_(one_minus_decay * tmp) + else: + assert key not in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_( + shadow_params[self.m_name2s_name[key]].data) + else: + assert key not in self.m_name2s_name + + def store(self, parameters): + """ + Save the current 
parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/modelscope/models/cv/anydoor/ldm/modules/encoders/__init__.py b/modelscope/models/cv/anydoor/ldm/modules/encoders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py b/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py new file mode 100644 index 000000000..bfbfb78ea --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/modules/encoders/modules.py @@ -0,0 +1,371 @@ +import os + +import open_clip +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint +from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel, + T5Tokenizer) + +from ....dinov2 import hubconf +from ....ldm.util import count_params + + +class LayerNormFp32(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16 (by casting to float32 and back).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm( + x.to(torch.float32), self.normalized_shape, self.weight, self.bias, + self.eps) + return x.to(orig_type) + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm (with cast back to input dtype).""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, + self.eps) + return x.to(orig_type) + + +class AbstractEncoder(nn.Module): + + def __init__(self): + super().__init__() + + def encode(self, *args, **kwargs): + raise NotImplementedError + + +class IdentityEncoder(AbstractEncoder): + + def encode(self, x): + return x + + +class ClassEmbedder(nn.Module): + + def __init__(self, embed_dim, n_classes=1000, key='class', ucg_rate=0.1): + super().__init__() + self.key = key + self.embedding = nn.Embedding(n_classes, embed_dim) + self.n_classes = n_classes + self.ucg_rate = ucg_rate + + def forward(self, batch, key=None, disable_dropout=False): + if key is None: + key = self.key + # this is for use in crossattn + c = batch[key][:, None] + if self.ucg_rate > 0. and not disable_dropout: + mask = 1. - torch.bernoulli(torch.ones_like(c) * self.ucg_rate) + c = mask * c + (1 - mask) * torch.ones_like(c) * ( + self.n_classes - 1) + c = c.long() + c = self.embedding(c) + return c + + def get_unconditional_conditioning(self, bs, device='cuda'): + uc_class = self.n_classes - 1 # 1000 classes --> 0 ... 
999, one extra class for ucg (class 1000) + uc = torch.ones((bs, ), device=device) * uc_class + uc = {self.key: uc} + return uc + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class FrozenT5Embedder(AbstractEncoder): + """Uses the T5 transformer encoder for text""" + + def __init__(self, + version='google/t5-v1_1-large', + device='cuda', + max_length=77, + freeze=True + ): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained(version) + self.transformer = T5EncoderModel.from_pretrained(version) + self.device = device + self.max_length = max_length # TODO: typical value? + if freeze: + self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class FrozenCLIPEmbedder(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + LAYERS = ['last', 'pooled', 'hidden'] + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77, + freeze=True, + layer='last', + layer_idx=None): # clip-vit-base-patch32 + super().__init__() + assert layer in self.LAYERS + self.tokenizer = CLIPTokenizer.from_pretrained(version) + self.transformer = CLIPTextModel.from_pretrained(version) + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + self.layer_idx = layer_idx + if layer == 'hidden': + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer( + input_ids=tokens, output_hidden_states=self.layer == 'hidden') + if self.layer == 'last': + z = outputs.last_hidden_state + elif self.layer == 'pooled': + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + return z + + def encode(self, text): + return self(text) + + +class FrozenOpenCLIPEmbedder(AbstractEncoder): + """ + Uses the OpenCLIP transformer encoder for text + """ + LAYERS = [ + # "pooled", + 'last', + 'penultimate' + ] + + def __init__(self, + arch='ViT-H-14', + version='laion2b_s32b_b79k', + device='cuda', + max_length=77, + freeze=True, + layer='last'): + super().__init__() + assert layer in self.LAYERS + model, _, _ = open_clip.create_model_and_transforms( + arch, device=torch.device('cpu'), pretrained=version) + del model.visual + self.model = model + + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + if self.layer == 'last': + self.layer_idx = 0 + elif 
self.layer == 'penultimate': + self.layer_idx = 1 + else: + raise NotImplementedError() + + def freeze(self): + self.model = self.model.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + tokens = open_clip.tokenize(text) + z = self.encode_with_transformer(tokens.to(self.device)) + return z + + def encode_with_transformer(self, text): + x = self.model.token_embedding(text) # [batch_size, n_ctx, d_model] + x = x + self.model.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.model.ln_final(x) + return x + + def text_transformer_forward(self, x: torch.Tensor, attn_mask=None): + for i, r in enumerate(self.model.transformer.resblocks): + if i == len(self.model.transformer.resblocks) - self.layer_idx: + break + if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting( + ): + x = checkpoint(r, x, attn_mask) + else: + x = r(x, attn_mask=attn_mask) + return x + + def encode(self, text): + return self(text) + + +class FrozenCLIPT5Encoder(AbstractEncoder): + + def __init__(self, + clip_version='openai/clip-vit-large-patch14', + t5_version='google/t5-v1_1-xl', + device='cuda', + clip_max_length=77, + t5_max_length=77): + super().__init__() + self.clip_encoder = FrozenCLIPEmbedder( + clip_version, device, max_length=clip_max_length) + self.t5_encoder = FrozenT5Embedder( + t5_version, device, max_length=t5_max_length) + print( + f'{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, ' + f'{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params.' + ) + + def encode(self, text): + return self(text) + + def forward(self, text): + clip_z = self.clip_encoder.encode(text) + t5_z = self.t5_encoder.encode(text) + return [clip_z, t5_z] + + +class FrozenOpenCLIPImageEncoder(AbstractEncoder): + """ + Uses the OpenCLIP transformer encoder for image + """ + + def __init__(self, + arch='ViT-H-14', + version='laion2b_s32b_b79k', + device='cuda', + freeze=True): + super().__init__() + model, _, preprocess = open_clip.create_model_and_transforms( + arch, device=torch.device('cpu'), pretrained=version) + del model.transformer + self.model = model + self.model.visual.output_tokens = True + self.device = device + if freeze: + self.freeze() + self.image_mean = torch.tensor( + [0.48145466, 0.4578275, + 0.40821073]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.image_std = torch.tensor( + [0.26862954, 0.26130258, + 0.275777]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.projector_token = nn.Linear(1280, 1024) + self.projector_embed = nn.Linear(1024, 1024) + + def freeze(self): + self.model.visual.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def forward(self, image): + if isinstance(image, list): + image = torch.cat(image, 0) + image = (image.to(self.device) - self.image_mean.to( + self.device)) / self.image_std.to(self.device) + image_features, tokens = self.model.visual(image) + image_features = image_features.unsqueeze(1) + image_features = self.projector_embed(image_features) + tokens = self.projector_token(tokens) + hint = torch.cat([image_features, tokens], 1) + return hint + + def encode(self, image): + return self(image) + + +class FrozenDinoV2Encoder(AbstractEncoder): + """ + Uses the DINOv2 encoder for image + """ + + def __init__(self, model_path, device='cuda', freeze=True): + 
DINOv2_weight_path = model_path + + super().__init__() + dinov2 = hubconf.dinov2_vitg14() + state_dict = torch.load(DINOv2_weight_path) + dinov2.load_state_dict(state_dict, strict=False) + self.model = dinov2.to(device) + self.device = device + if freeze: + self.freeze() + self.image_mean = torch.tensor( + [0.485, 0.456, 0.406]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.image_std = torch.tensor( + [0.229, 0.224, 0.225]).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + self.projector = nn.Linear(1536, 1024) + + def freeze(self): + self.model.eval() + for param in self.model.parameters(): + param.requires_grad = False + + def forward(self, image): + if isinstance(image, list): + image = torch.cat(image, 0) + + image = (image.to(self.device) - self.image_mean.to( + self.device)) / self.image_std.to(self.device) + features = self.model.forward_features(image) + tokens = features['x_norm_patchtokens'] + image_features = features['x_norm_clstoken'] + image_features = image_features.unsqueeze(1) + hint = torch.cat([image_features, tokens], 1) # 8,257,1024 + hint = self.projector(hint) + return hint + + def encode(self, image): + return self(image) diff --git a/modelscope/models/cv/anydoor/ldm/util.py b/modelscope/models/cv/anydoor/ldm/util.py new file mode 100644 index 000000000..0a0c69e74 --- /dev/null +++ b/modelscope/models/cv/anydoor/ldm/util.py @@ -0,0 +1,221 @@ +import importlib +from inspect import isfunction + +import numpy as np +import torch +from PIL import Image, ImageDraw, ImageFont +from torch import optim + + +def log_txt_as_img(wh, xc, size=10): + # wh a tuple of (width, height) + # xc a list of captions to plot + b = len(xc) + txts = list() + for bi in range(b): + txt = Image.new('RGB', wh, color='white') + draw = ImageDraw.Draw(txt) + font = ImageFont.truetype('font/DejaVuSans.ttf', size=size) + nc = int(40 * (wh[0] / 256)) + lines = '\n'.join(xc[bi][start:start + nc] + for start in range(0, len(xc[bi]), nc)) + + try: + draw.text((0, 0), lines, fill='black', font=font) + except UnicodeEncodeError: + print('Cant encode string for logging. Skipping.') + + txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 + txts.append(txt) + txts = np.stack(txts) + txts = torch.tensor(txts) + return txts + + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + + +def isimage(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def mean_flat(tensor): + """ + https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print( + f'{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.' 
+ ) + return total_params + + +def instantiate_from_config(config): + if 'target' not in config: + if config == '__is_first_stage__': + return None + elif config == '__is_unconditional__': + return None + raise KeyError('Expected key `target` to instantiate.') + return get_obj_from_str(config['target'])(**config.get('params', dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit('.', 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +class AdamWwithEMAandWings(optim.Optimizer): + # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 + def __init__(self, + params, + lr=1.e-3, + betas=(0.9, 0.999), + eps=1.e-8, + weight_decay=1.e-2, + amsgrad=False, + ema_decay=0.9999, + ema_power=1., + param_names=()): + """AdamW that saves EMA versions of the parameters.""" + if not 0.0 <= lr: + raise ValueError('Invalid learning rate: {}'.format(lr)) + if not 0.0 <= eps: + raise ValueError('Invalid epsilon value: {}'.format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError('Invalid beta parameter at index 0: {}'.format( + betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError('Invalid beta parameter at index 1: {}'.format( + betas[1])) + if not 0.0 <= weight_decay: + raise ValueError( + 'Invalid weight_decay value: {}'.format(weight_decay)) + if not 0.0 <= ema_decay <= 1.0: + raise ValueError('Invalid ema_decay value: {}'.format(ema_decay)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + ema_decay=ema_decay, + ema_power=ema_power, + param_names=param_names) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + ema_params_with_grad = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + ema_decay = group['ema_decay'] + ema_power = group['ema_power'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError( + 'AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of parameter values + state['param_exp_avg'] = p.detach().float().clone() + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + ema_params_with_grad.append(state['param_exp_avg']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optim._functional.adamw( + params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=False) + + cur_ema_decay = min(ema_decay, 1 - state['step']**-ema_power) + for param, ema_param in zip(params_with_grad, + ema_params_with_grad): + ema_param.mul_(cur_ema_decay).add_( + param.float(), alpha=1 - cur_ema_decay) + + return loss diff --git a/modelscope/models/cv/body_3d_keypoints/__init__.py b/modelscope/models/cv/body_3d_keypoints/__init__.py index 2672ba9a8..1c08aa247 100644 --- a/modelscope/models/cv/body_3d_keypoints/__init__.py +++ b/modelscope/models/cv/body_3d_keypoints/__init__.py @@ -4,11 +4,11 @@ from modelscope.utils.import_utils import LazyImportModule if TYPE_CHECKING: - from .cannonical_pose import BodyKeypointsDetection3D + from .canonical_pose import BodyKeypointsDetection3D from .hdformer import HDFormerDetector else: _import_structure = { - 'cannonical_pose': ['BodyKeypointsDetection3D'], + 'canonical_pose': ['BodyKeypointsDetection3D'], 'hdformer': ['HDFormerDetector'], } diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/__init__.py similarity index 100% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/__init__.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/__init__.py diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py similarity index 95% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py index e9c083950..57159f0cc 100644 --- a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/body_3d_pose.py +++ b/modelscope/models/cv/body_3d_keypoints/canonical_pose/body_3d_pose.py @@ -10,7 +10,7 @@ from modelscope.metainfo import Models from modelscope.models.base.base_torch_model import TorchModel from modelscope.models.builder import MODELS -from modelscope.models.cv.body_3d_keypoints.cannonical_pose.canonical_pose_modules import ( +from modelscope.models.cv.body_3d_keypoints.canonical_pose.canonical_pose_modules import ( TemporalModel, TransCan3Dkeys) from modelscope.utils.config import Config from modelscope.utils.constant import ModelFile, Tasks @@ -218,17 +218,17 @@ def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, w = input_video_frame_num - pad * 2 lst_pose2d_rr = [] - lst_pose2d_cannoical = [] + lst_pose2d_canonical = [] for i in range(pad, w + pad): lst_pose2d_rr.append(pose2d_rr[:, i - pad:i + pad + 1]) - lst_pose2d_cannoical.append(pose2d_canonical[:, + lst_pose2d_canonical.append(pose2d_canonical[:, i - pad:i + pad + 1]) - input_pose2d_rr = torch.cat(lst_pose2d_cannoical, axis=0) - input_pose2d_cannoical = 
torch.cat(lst_pose2d_cannoical, axis=0) + input_pose2d_rr = torch.cat(lst_pose2d_canonical, axis=0) + input_pose2d_canonical = torch.cat(lst_pose2d_canonical, axis=0) if self.cfg.model.MODEL.USE_CANONICAL_COORDS: - input_pose2d_abs = input_pose2d_cannoical.clone() + input_pose2d_abs = input_pose2d_canonical.clone() else: input_pose2d_abs = input_pose2d_rr.clone() input_pose2d_abs[:, :, 1:] += input_pose2d_abs[:, :, :1] @@ -238,8 +238,8 @@ def get_abs_2d_pts(self, input_video_frame_num, pose2d_rr, def canonicalize_2Ds(self, pos2d, f, c): cs = np.array([c[0], c[1]]).reshape(1, 1, 2) fs = np.array([f[0], f[1]]).reshape(1, 1, 2) - canoical_2Ds = (pos2d - cs) / fs - return canoical_2Ds + canonical_2Ds = (pos2d - cs) / fs + return canonical_2Ds def normalize_screen_coordinates(self, X, w, h): assert X.shape[-1] == 2 diff --git a/modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py b/modelscope/models/cv/body_3d_keypoints/canonical_pose/canonical_pose_modules.py similarity index 100% rename from modelscope/models/cv/body_3d_keypoints/cannonical_pose/canonical_pose_modules.py rename to modelscope/models/cv/body_3d_keypoints/canonical_pose/canonical_pose_modules.py diff --git a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py index 73c9b4be3..135d5f50e 100644 --- a/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py +++ b/modelscope/models/cv/body_3d_keypoints/hdformer/hdformer_detector.py @@ -58,7 +58,7 @@ def load_model(self, load_to_cpu=False): self.net.eval() def preprocess(self, input: Dict[str, Any]) -> Dict[str, Any]: - """Proprocess of 2D input joints. + """Preprocess of 2D input joints. Args: input (Dict[str, Any]): [NUM_FRAME, NUM_JOINTS, 2], input 2d human body keypoints. diff --git a/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py b/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py index 078cc2ec8..75c65ef49 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/midas/midas/transforms.py @@ -7,7 +7,7 @@ def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): - """Rezise the sample to ensure the given size. Keeps aspect ratio. + """Resize the sample to ensure the given size. Keeps aspect ratio. Args: sample (dict): sample @@ -133,7 +133,7 @@ def get_size(self, width, height): # fit height scale_width = scale_height elif self.__resize_method == 'minimal': - # scale as least as possbile + # scale as least as possible if abs(1 - scale_width) < abs(1 - scale_height): # fit width scale_height = scale_width @@ -198,7 +198,7 @@ def __call__(self, sample): class NormalizeImage(object): - """Normlize image by given mean and std. + """Normalize image by given mean and std. 
""" def __init__(self, mean, std): diff --git a/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py b/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py index d348d1542..1a5f3c589 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/mlsd/utils.py @@ -13,7 +13,7 @@ from torch.nn import functional as F -def deccode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): +def decode_output_score_and_ptss(tpMap, topk_n=200, ksize=5): ''' tpMap: center: tpMap[1, 0, :, :] @@ -61,7 +61,7 @@ def pred_lines(image, batch_image = torch.from_numpy(batch_image).float().cuda() outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) + pts, pts_score, vmap = decode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] end = vmap[:, :, 2:] dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) @@ -116,7 +116,7 @@ def pred_squares(image, model, input_shape=[512, 512], params=params_glob): batch_image = torch.from_numpy(batch_image).float().cuda() outputs = model(batch_image) - pts, pts_score, vmap = deccode_output_score_and_ptss(outputs, 200, 3) + pts, pts_score, vmap = decode_output_score_and_ptss(outputs, 200, 3) start = vmap[:, :, :2] # (x, y) end = vmap[:, :, 2:] # (x, y) dist_map = np.sqrt(np.sum((start - end)**2, axis=-1)) @@ -268,7 +268,7 @@ def pred_squares(image, model, input_shape=[512, 512], params=params_glob): | dist(inter,0), dist(inter,0), dist(inter,0), ... | | dist(inter,1), dist(inter,1), dist(inter,1), ... | ... - dist_inter_to_semgnet2: + dist_inter_to_segment2: | dist(inter,0), dist(inter,1), dist(inter,2), ... | | dist(inter,0), dist(inter,1), dist(inter,2), ... | ... 
diff --git a/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py b/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py index 11e33c2fe..d1fb09fcf 100644 --- a/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py +++ b/modelscope/models/cv/controllable_image_generation/annotator/openpose/body.py @@ -130,7 +130,7 @@ def __call__(self, oriImg): limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]] - # the middle joints heatmap correpondence + # the middle joints heatmap correspondence mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44], [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30], [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38], diff --git a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py index 64f40da06..af050b75e 100644 --- a/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py +++ b/modelscope/models/cv/crowd_counting/hrnet_aspp_relu.py @@ -556,10 +556,10 @@ def forward(self, x): x = x + F.relu_(aspp_out[i] * 0.25) * pred_attn_list[i] bz = x.size(0) - # -- Besides, we also need to let the prediction attention be close to visable domain + # -- Besides, we also need to let the prediction attention be close to visible domain # -- Calculate the domain distance and get the weights # - First, detach domains - G_all_d = self.G_all.detach() # use detached G_all for calulcating + G_all_d = self.G_all.detach() # use detached G_all for calculating pred_attn_d = pred_attn.detach().view(bz, 512, 1, 1) if self.cosine == 1: diff --git a/modelscope/models/cv/dense_optical_flow_estimation/__init__.py b/modelscope/models/cv/dense_optical_flow_estimation/__init__.py new file mode 100644 index 000000000..be8fc28ed --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/__init__.py @@ -0,0 +1,21 @@ +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .raft_model import DenseOpticalFlowEstimation + +else: + _import_structure = { + 'raft_dense_optical_flow_estimation': ['DenseOpticalFlowEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/__init__.py b/modelscope/models/cv/dense_optical_flow_estimation/core/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py b/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py new file mode 100644 index 000000000..a0b1a27e4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/corr.py @@ -0,0 +1,95 @@ +import torch +import torch.nn.functional as F + +from modelscope.models.cv.dense_optical_flow_estimation.core.utils.utils import ( + bilinear_sampler, coords_grid) + +try: + import alt_cuda_corr +except ModuleNotFoundError: + # alt_cuda_corr is not compiled + pass + + +class CorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + self.corr_pyramid = [] + + # all pairs correlation + corr = CorrBlock.corr(fmap1, fmap2) + + batch, h1, w1, dim, h2, w2 = corr.shape + corr = corr.reshape(batch * h1 * w1, dim, 
h2, w2) + + self.corr_pyramid.append(corr) + for i in range(self.num_levels - 1): + corr = F.avg_pool2d(corr, 2, stride=2) + self.corr_pyramid.append(corr) + + def __call__(self, coords): + r = self.radius + coords = coords.permute(0, 2, 3, 1) + batch, h1, w1, _ = coords.shape + + out_pyramid = [] + for i in range(self.num_levels): + corr = self.corr_pyramid[i] + dx = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + dy = torch.linspace(-r, r, 2 * r + 1, device=coords.device) + delta = torch.stack(torch.meshgrid(dy, dx), axis=-1) + + centroid_lvl = coords.reshape(batch * h1 * w1, 1, 1, 2) / 2**i + delta_lvl = delta.view(1, 2 * r + 1, 2 * r + 1, 2) + coords_lvl = centroid_lvl + delta_lvl + + corr = bilinear_sampler(corr, coords_lvl) + corr = corr.view(batch, h1, w1, -1) + out_pyramid.append(corr) + + out = torch.cat(out_pyramid, dim=-1) + return out.permute(0, 3, 1, 2).contiguous().float() + + @staticmethod + def corr(fmap1, fmap2): + batch, dim, ht, wd = fmap1.shape + fmap1 = fmap1.view(batch, dim, ht * wd) + fmap2 = fmap2.view(batch, dim, ht * wd) + + corr = torch.matmul(fmap1.transpose(1, 2), fmap2) + corr = corr.view(batch, ht, wd, 1, ht, wd) + return corr / torch.sqrt(torch.tensor(dim).float()) + + +class AlternateCorrBlock: + + def __init__(self, fmap1, fmap2, num_levels=4, radius=4): + self.num_levels = num_levels + self.radius = radius + + self.pyramid = [(fmap1, fmap2)] + for i in range(self.num_levels): + fmap1 = F.avg_pool2d(fmap1, 2, stride=2) + fmap2 = F.avg_pool2d(fmap2, 2, stride=2) + self.pyramid.append((fmap1, fmap2)) + + def __call__(self, coords): + coords = coords.permute(0, 2, 3, 1) + B, H, W, _ = coords.shape + dim = self.pyramid[0][0].shape[1] + + corr_list = [] + for i in range(self.num_levels): + r = self.radius + fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous() + fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous() + + coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous() + corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r) + corr_list.append(corr.squeeze(1)) + + corr = torch.stack(corr_list, dim=1) + corr = corr.reshape(B, -1, H, W) + return corr / torch.sqrt(torch.tensor(dim).float()) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py b/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py new file mode 100644 index 000000000..eb8a85593 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/datasets.py @@ -0,0 +1,297 @@ +# Data loading based on https://github.com/NVIDIA/flownet2-pytorch + +import math +import os +import os.path as osp +import random +from glob import glob + +import numpy as np +import torch +import torch.nn.functional as F +import torch.utils.data as data +from utils import frame_utils +from utils.augmentor import FlowAugmentor, SparseFlowAugmentor + + +class FlowDataset(data.Dataset): + + def __init__(self, aug_params=None, sparse=False): + self.augmentor = None + self.sparse = sparse + if aug_params is not None: + if sparse: + self.augmentor = SparseFlowAugmentor(**aug_params) + else: + self.augmentor = FlowAugmentor(**aug_params) + + self.is_test = False + self.init_seed = False + self.flow_list = [] + self.image_list = [] + self.extra_info = [] + + def __getitem__(self, index): + + if self.is_test: + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + img1 = np.array(img1).astype(np.uint8)[..., :3] + img2 = np.array(img2).astype(np.uint8)[..., :3] + img1 = 
torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + return img1, img2, self.extra_info[index] + + if not self.init_seed: + worker_info = torch.utils.data.get_worker_info() + if worker_info is not None: + torch.manual_seed(worker_info.id) + np.random.seed(worker_info.id) + random.seed(worker_info.id) + self.init_seed = True + + index = index % len(self.image_list) + valid = None + if self.sparse: + flow, valid = frame_utils.readFlowKITTI(self.flow_list[index]) + else: + flow = frame_utils.read_gen(self.flow_list[index]) + + img1 = frame_utils.read_gen(self.image_list[index][0]) + img2 = frame_utils.read_gen(self.image_list[index][1]) + + flow = np.array(flow).astype(np.float32) + img1 = np.array(img1).astype(np.uint8) + img2 = np.array(img2).astype(np.uint8) + + # grayscale images + if len(img1.shape) == 2: + img1 = np.tile(img1[..., None], (1, 1, 3)) + img2 = np.tile(img2[..., None], (1, 1, 3)) + else: + img1 = img1[..., :3] + img2 = img2[..., :3] + + if self.augmentor is not None: + if self.sparse: + img1, img2, flow, valid = self.augmentor( + img1, img2, flow, valid) + else: + img1, img2, flow = self.augmentor(img1, img2, flow) + + img1 = torch.from_numpy(img1).permute(2, 0, 1).float() + img2 = torch.from_numpy(img2).permute(2, 0, 1).float() + flow = torch.from_numpy(flow).permute(2, 0, 1).float() + + if valid is not None: + valid = torch.from_numpy(valid) + else: + valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000) + + return img1, img2, flow, valid.float() + + def __rmul__(self, v): + self.flow_list = v * self.flow_list + self.image_list = v * self.image_list + return self + + def __len__(self): + return len(self.image_list) + + +class MpiSintel(FlowDataset): + + def __init__(self, + aug_params=None, + split='training', + root='datasets/Sintel', + dstype='clean'): + super(MpiSintel, self).__init__(aug_params) + flow_root = osp.join(root, split, 'flow') + image_root = osp.join(root, split, dstype) + + if split == 'test': + self.is_test = True + + for scene in os.listdir(image_root): + image_list = sorted(glob(osp.join(image_root, scene, '*.png'))) + for i in range(len(image_list) - 1): + self.image_list += [[image_list[i], image_list[i + 1]]] + self.extra_info += [(scene, i)] # scene and frame_id + + if split != 'test': + self.flow_list += sorted( + glob(osp.join(flow_root, scene, '*.flo'))) + + +class FlyingChairs(FlowDataset): + + def __init__(self, + aug_params=None, + split='train', + root='datasets/FlyingChairs_release/data'): + super(FlyingChairs, self).__init__(aug_params) + + images = sorted(glob(osp.join(root, '*.ppm'))) + flows = sorted(glob(osp.join(root, '*.flo'))) + assert (len(images) // 2 == len(flows)) + + split_list = np.loadtxt('chairs_split.txt', dtype=np.int32) + for i in range(len(flows)): + xid = split_list[i] + if (split == 'training' and xid == 1) or (split == 'validation' + and xid == 2): + self.flow_list += [flows[i]] + self.image_list += [[images[2 * i], images[2 * i + 1]]] + + +class FlyingThings3D(FlowDataset): + + def __init__(self, + aug_params=None, + root='datasets/FlyingThings3D', + dstype='frames_cleanpass'): + super(FlyingThings3D, self).__init__(aug_params) + + for cam in ['left']: + for direction in ['into_future', 'into_past']: + image_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*'))) + image_dirs = sorted([osp.join(f, cam) for f in image_dirs]) + + flow_dirs = sorted( + glob(osp.join(root, 'optical_flow/TRAIN/*/*'))) + flow_dirs = sorted( + [osp.join(f, direction, cam) for f in 
flow_dirs]) + + for idir, fdir in zip(image_dirs, flow_dirs): + images = sorted(glob(osp.join(idir, '*.png'))) + flows = sorted(glob(osp.join(fdir, '*.pfm'))) + for i in range(len(flows) - 1): + if direction == 'into_future': + self.image_list += [[images[i], images[i + 1]]] + self.flow_list += [flows[i]] + elif direction == 'into_past': + self.image_list += [[images[i + 1], images[i]]] + self.flow_list += [flows[i + 1]] + + +class KITTI(FlowDataset): + + def __init__(self, + aug_params=None, + split='training', + root='datasets/KITTI'): + super(KITTI, self).__init__(aug_params, sparse=True) + if split == 'testing': + self.is_test = True + + root = osp.join(root, split) + images1 = sorted(glob(osp.join(root, 'image_2/*_10.png'))) + images2 = sorted(glob(osp.join(root, 'image_2/*_11.png'))) + + for img1, img2 in zip(images1, images2): + frame_id = img1.split('/')[-1] + self.extra_info += [[frame_id]] + self.image_list += [[img1, img2]] + + if split == 'training': + self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png'))) + + +class HD1K(FlowDataset): + + def __init__(self, aug_params=None, root='datasets/HD1k'): + super(HD1K, self).__init__(aug_params, sparse=True) + + seq_ix = 0 + while 1: + flows = sorted( + glob( + os.path.join(root, 'hd1k_flow_gt', + 'flow_occ/%06d_*.png' % seq_ix))) + images = sorted( + glob( + os.path.join(root, 'hd1k_input', + 'image_2/%06d_*.png' % seq_ix))) + + if len(flows) == 0: + break + + for i in range(len(flows) - 1): + self.flow_list += [flows[i]] + self.image_list += [[images[i], images[i + 1]]] + + seq_ix += 1 + + +def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'): + """ Create the data loader for the corresponding trainign set """ + + if args.stage == 'chairs': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.1, + 'max_scale': 1.0, + 'do_flip': True + } + train_dataset = FlyingChairs(aug_params, split='training') + + elif args.stage == 'things': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.4, + 'max_scale': 0.8, + 'do_flip': True + } + clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass') + final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass') + train_dataset = clean_dataset + final_dataset + + elif args.stage == 'sintel': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.2, + 'max_scale': 0.6, + 'do_flip': True + } + things = FlyingThings3D(aug_params, dstype='frames_cleanpass') + sintel_clean = MpiSintel(aug_params, split='training', dstype='clean') + sintel_final = MpiSintel(aug_params, split='training', dstype='final') + + if TRAIN_DS == 'C+T+K+S+H': + kitti = KITTI({ + 'crop_size': args.image_size, + 'min_scale': -0.3, + 'max_scale': 0.5, + 'do_flip': True + }) + hd1k = HD1K({ + 'crop_size': args.image_size, + 'min_scale': -0.5, + 'max_scale': 0.2, + 'do_flip': True + }) + train_dataset = 100 * sintel_clean + 100 * sintel_final + 200 * kitti + 5 * hd1k + things + + elif TRAIN_DS == 'C+T+K/S': + train_dataset = 100 * sintel_clean + 100 * sintel_final + things + + elif args.stage == 'kitti': + aug_params = { + 'crop_size': args.image_size, + 'min_scale': -0.2, + 'max_scale': 0.4, + 'do_flip': False + } + train_dataset = KITTI(aug_params, split='training') + + train_loader = data.DataLoader( + train_dataset, + batch_size=args.batch_size, + pin_memory=False, + shuffle=True, + num_workers=4, + drop_last=True) + + print('Training with %d image pairs' % len(train_dataset)) + return train_loader diff --git 
a/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py b/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py new file mode 100644 index 000000000..dfa8e4de9 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/extractor.py @@ -0,0 +1,285 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm3) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BottleneckBlock(nn.Module): + + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d( + in_planes, planes // 4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d( + planes // 4, planes // 4, kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes // 4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm2 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes // 4) + self.norm3 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm( + num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes // 4) + self.norm2 = nn.BatchNorm2d(planes // 4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes // 4) + self.norm2 = nn.InstanceNorm2d(planes // 4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), + self.norm4) + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y 
= self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class BasicEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class SmallEncoder(nn.Module): + + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, + (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock( + self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, 
stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py b/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py new file mode 100644 index 000000000..f2b801bc4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/raft.py @@ -0,0 +1,163 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.cv.dense_optical_flow_estimation.core.corr import ( + AlternateCorrBlock, CorrBlock) +from modelscope.models.cv.dense_optical_flow_estimation.core.extractor import ( + BasicEncoder, SmallEncoder) +from modelscope.models.cv.dense_optical_flow_estimation.core.update import ( + BasicUpdateBlock, SmallUpdateBlock) +from modelscope.models.cv.dense_optical_flow_estimation.core.utils.utils import ( + bilinear_sampler, coords_grid, upflow8) + +autocast = torch.cuda.amp.autocast + +# try: +# autocast = torch.cuda.amp.autocast +# except: +# # dummy autocast for PyTorch < 1.6 +# class autocast: +# def __init__(self, enabled): +# pass +# def __enter__(self): +# pass +# def __exit__(self, *args): +# pass + + +class RAFT(TorchModel): + + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + args.corr_levels = 4 + args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + args.corr_levels = 4 + args.corr_radius = 4 + + if 'dropout' not in self.args: + self.args.dropout = 0 + + if 'alternate_corr' not in self.args: + self.args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder( + output_dim=128, norm_fn='instance', dropout=args.dropout) + self.cnet = SmallEncoder( + output_dim=hdim + cdim, norm_fn='none', dropout=args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = BasicEncoder( + output_dim=256, norm_fn='instance', dropout=args.dropout) + self.cnet = BasicEncoder( + output_dim=hdim + cdim, norm_fn='batch', dropout=args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H // 8, W // 8, device=img.device) + coords1 = coords_grid(N, H // 8, W // 8, device=img.device) + + # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = 
F.unfold(8 * flow, [3, 3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8 * H, 8 * W) + + def forward(self, + image1, + image2, + iters=20, + flow_init=None, + upsample=True, + test_mode=False): + """ Estimate optical flow between pair of frames """ + + image1 = 2 * (image1 / 255.0) - 1.0 + image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock( + fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, delta_flow = self.update_block( + net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/update.py b/modelscope/models/cv/dense_optical_flow_estimation/core/update.py new file mode 100644 index 000000000..b43bb0ecd --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/update.py @@ -0,0 +1,157 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + + h = (1 - z) * h + z * q + return h + + +class SepConvGRU(nn.Module): + + def __init__(self, hidden_dim=128, input_dim=192 + 128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convr1 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (1, 5), padding=(0, 2)) + self.convq1 = nn.Conv2d( + hidden_dim + 
input_dim, hidden_dim, (1, 5), padding=(0, 2)) + + self.convz2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convr2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + self.convq2 = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, (5, 1), padding=(2, 0)) + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + + return h + + +class SmallMotionEncoder(nn.Module): + + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class BasicMotionEncoder(nn.Module): + + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2 * args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64 + 192, 128 - 2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + + +class SmallUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82 + 64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + + +class BasicUpdateBlock(nn.Module): + + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU( + hidden_dim=hidden_dim, input_dim=128 + hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(256, 64 * 9, 1, padding=0)) + + def forward(self, net, inp, corr, flow, upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/__init__.py 
b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py new file mode 100644 index 000000000..ff1b70dcb --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/augmentor.py @@ -0,0 +1,286 @@ +import math +import random + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from PIL import Image +from torchvision.transforms import ColorJitter + +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + + +class FlowAugmentor: + + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): + + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter( + brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5 / 3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + """ Photometric augmentation """ + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array( + self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array( + self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array( + self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def eraser_transform(self, img1, img2, bounds=[50, 100]): + """ Occlusion augmentation """ + + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(bounds[0], bounds[1]) + dy = np.random.randint(bounds[0], bounds[1]) + img2[y0:y0 + dy, x0:x0 + dx, :] = mean_color + + return img1, img2 + + def spatial_transform(self, img1, img2, flow): + # randomly sample scale + ht, wd = img1.shape[:2] + min_scale = np.maximum((self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + + scale = 2**np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2**np.random.uniform(-self.max_stretch, + self.max_stretch) + scale_y *= 2**np.random.uniform(-self.max_stretch, + self.max_stretch) + + scale_x = np.clip(scale_x, min_scale, None) + scale_y = np.clip(scale_y, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize( + img1, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize( + img2, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + flow = cv2.resize( + flow, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + flow = flow * [scale_x, scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + 
img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow = self.spatial_transform(img1, img2, flow) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + + return img1, img2, flow + + +class SparseFlowAugmentor: + + def __init__(self, + crop_size, + min_scale=-0.2, + max_scale=0.5, + do_flip=False): + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter( + brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3 / 3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array( + self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + return img1, img2 + + def eraser_transform(self, img1, img2): + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(50, 100) + dy = np.random.randint(50, 100) + img2[y0:y0 + dy, x0:x0 + dx, :] = mean_color + + return img1, img2 + + def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = valid.reshape(-1).astype(np.float32) + + coords0 = coords[valid >= 1] + flow0 = flow[valid >= 1] + + ht1 = int(round(ht * fy)) + wd1 = int(round(wd * fx)) + + coords1 = coords0 * [fx, fy] + flow1 = flow0 * [fx, fy] + + xx = np.round(coords1[:, 0]).astype(np.int32) + yy = np.round(coords1[:, 1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) + valid_img = np.zeros([ht1, wd1], dtype=np.int32) + + flow_img[yy, xx] = flow1 + valid_img[yy, xx] = 1 + + return flow_img, valid_img + + def spatial_transform(self, img1, img2, flow, valid): + # randomly sample scale + + ht, wd = img1.shape[:2] + min_scale = np.maximum((self.crop_size[0] + 1) / float(ht), + (self.crop_size[1] + 1) / float(wd)) + + scale = 2**np.random.uniform(self.min_scale, self.max_scale) + scale_x = np.clip(scale, min_scale, None) + scale_y = np.clip(scale, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize( + img1, + None, + fx=scale_x, + fy=scale_y, + interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize( + img2, + None, + fx=scale_x, + fy=scale_y, 
+ interpolation=cv2.INTER_LINEAR) + flow, valid = self.resize_sparse_flow_map( + flow, valid, fx=scale_x, fy=scale_y) + + if self.do_flip: + if np.random.rand() < 0.5: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + valid = valid[:, ::-1] + + margin_y = 20 + margin_x = 50 + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) + x0 = np.random.randint(-margin_x, + img1.shape[1] - self.crop_size[1] + margin_x) + + y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) + x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + img2 = img2[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + flow = flow[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + valid = valid[y0:y0 + self.crop_size[0], x0:x0 + self.crop_size[1]] + return img1, img2, flow, valid + + def __call__(self, img1, img2, flow, valid): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, valid = self.spatial_transform( + img1, img2, flow, valid) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + valid = np.ascontiguousarray(valid) + + return img1, img2, flow, valid diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py new file mode 100644 index 000000000..46c92e348 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/flow_viz.py @@ -0,0 +1,132 @@ +# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization + +# MIT License +# +# Copyright (c) 2018 Tom Runia +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to conditions. +# +# Author: Tom Runia +# Date Created: 2018-08-03 + +import numpy as np + + +def make_colorwheel(): + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf + + Code follows the original C++ source code of Daniel Scharstein. + Code follows the the Matlab source code of Deqing Sun. 
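+ The wheel spans 55 hue bins over six transitions (RY, YG, GC, CB, BM, MR), + so the returned array has shape [55, 3].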
+ + Returns: + np.ndarray: Color wheel + """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = np.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.floor(255 * np.arange(0, RY) / RY) + col = col + RY + # YG + colorwheel[col:col + YG, 0] = 255 - np.floor(255 * np.arange(0, YG) / YG) + colorwheel[col:col + YG, 1] = 255 + col = col + YG + # GC + colorwheel[col:col + GC, 1] = 255 + colorwheel[col:col + GC, 2] = np.floor(255 * np.arange(0, GC) / GC) + col = col + GC + # CB + colorwheel[col:col + CB, 1] = 255 - np.floor(255 * np.arange(CB) / CB) + colorwheel[col:col + CB, 2] = 255 + col = col + CB + # BM + colorwheel[col:col + BM, 2] = 255 + colorwheel[col:col + BM, 0] = np.floor(255 * np.arange(0, BM) / BM) + col = col + BM + # MR + colorwheel[col:col + MR, 2] = 255 - np.floor(255 * np.arange(MR) / MR) + colorwheel[col:col + MR, 0] = 255 + return colorwheel + + +def flow_uv_to_colors(u, v, convert_to_bgr=False): + """ + Applies the flow color wheel to (possibly clipped) flow components u and v. + + According to the C++ source code of Daniel Scharstein + According to the Matlab source code of Deqing Sun + + Args: + u (np.ndarray): Input horizontal flow of shape [H,W] + v (np.ndarray): Input vertical flow of shape [H,W] + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. + + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8) + colorwheel = make_colorwheel() # shape [55x3] + ncols = colorwheel.shape[0] + rad = np.sqrt(np.square(u) + np.square(v)) + a = np.arctan2(-v, -u) / np.pi + fk = (a + 1) / 2 * (ncols - 1) + k0 = np.floor(fk).astype(np.int32) + k1 = k0 + 1 + k1[k1 == ncols] = 0 + f = fk - k0 + for i in range(colorwheel.shape[1]): + tmp = colorwheel[:, i] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1 - f) * col0 + f * col1 + idx = (rad <= 1) + col[idx] = 1 - rad[idx] * (1 - col[idx]) + col[~idx] = col[~idx] * 0.75 # out of range + # Note the 2-i => BGR instead of RGB + ch_idx = 2 - i if convert_to_bgr else i + flow_image[:, :, ch_idx] = np.floor(255 * col) + return flow_image + + +def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False): + """ + Expects a two dimensional flow image of shape. + + Args: + flow_uv (np.ndarray): Flow UV image of shape [H,W,2] + clip_flow (float, optional): Clip maximum of flow values. Defaults to None. + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 
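+ + Flow vectors are normalized by the maximum radius before colorization, so the + output encodes flow direction as hue and relative magnitude as saturation.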
+ + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + assert flow_uv.ndim == 3, 'input flow must have three dimensions' + assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]' + if clip_flow is not None: + flow_uv = np.clip(flow_uv, 0, clip_flow) + u = flow_uv[:, :, 0] + v = flow_uv[:, :, 1] + rad = np.sqrt(np.square(u) + np.square(v)) + rad_max = np.max(rad) + epsilon = 1e-5 + u = u / (rad_max + epsilon) + v = v / (rad_max + epsilon) + return flow_uv_to_colors(u, v, convert_to_bgr) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py new file mode 100644 index 000000000..dac10fe1e --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/frame_utils.py @@ -0,0 +1,142 @@ +import re +from os.path import * + +import cv2 +import numpy as np +from PIL import Image + +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +TAG_CHAR = np.array([202021.25], np.float32) + + +def readFlow(fn): + """ Read .flo file in Middlebury format""" + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + + # WARNING: this will work on little-endian architectures (eg Intel x86) only! + # print 'fn = %s'%(fn) + with open(fn, 'rb') as f: + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + print('Magic number incorrect. Invalid .flo file') + return None + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + # print 'Reading %d x %d flo file\n' % (w, h) + data = np.fromfile(f, np.float32, count=2 * int(w) * int(h)) + # Reshape data into 3D array (columns, rows, bands) + # The reshape here is for visualization, the original code is (w,h,2) + return np.resize(data, (int(h), int(w), 2)) + + +def readPFM(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header == b'PF': + color = True + elif header == b'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data + + +def writeFlow(filename, uv, v=None): + """ Write optical flow to file. + + If v is None, uv is assumed to contain both u and v channels, + stacked in depth. + Original code by Deqing Sun, adapted from Daniel Scharstein. 
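+ + The .flo layout written here is: a float32 magic tag (202021.25), then int32 width + and int32 height, then width*height*2 float32 values interleaved as (u, v) per pixel.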
+ """ + nBands = 2 + + if v is None: + assert (uv.ndim == 3) + assert (uv.shape[2] == 2) + u = uv[:, :, 0] + v = uv[:, :, 1] + else: + u = uv + + assert (u.shape == v.shape) + height, width = u.shape + f = open(filename, 'wb') + # write the header + f.write(TAG_CHAR) + np.array(width).astype(np.int32).tofile(f) + np.array(height).astype(np.int32).tofile(f) + # arrange into matrix form + tmp = np.zeros((height, width * nBands)) + tmp[:, np.arange(width) * 2] = u + tmp[:, np.arange(width) * 2 + 1] = v + tmp.astype(np.float32).tofile(f) + f.close() + + +def readFlowKITTI(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) + flow = flow[:, :, ::-1].astype(np.float32) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid + + +def readDispKITTI(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 + valid = disp > 0.0 + flow = np.stack([-disp, np.zeros_like(disp)], -1) + return flow, valid + + +def writeFlowKITTI(filename, uv): + uv = 64.0 * uv + 2**15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + + +def read_gen(file_name, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + return Image.open(file_name) + elif ext == '.bin' or ext == '.raw': + return np.load(file_name) + elif ext == '.flo': + return readFlow(file_name).astype(np.float32) + elif ext == '.pfm': + flow = readPFM(file_name).astype(np.float32) + if len(flow.shape) == 2: + return flow + else: + return flow[:, :, :-1] + return [] diff --git a/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py new file mode 100644 index 000000000..6228e6ef4 --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/core/utils/utils.py @@ -0,0 +1,93 @@ +import numpy as np +import torch +import torch.nn.functional as F +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [ + pad_wd // 2, pad_wd - pad_wd // 2, pad_ht // 2, + pad_ht - pad_ht // 2 + ] + else: + self._pad = [pad_wd // 2, pad_wd - pad_wd // 2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self, x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata((x1, y1), + dx, (x0, y0), + method='nearest', + fill_value=0) + + flow_y = interpolate.griddata((x1, y1), + dy, (x0, y0), + method='nearest', + fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + 
""" Wrapper for grid_sample, uses pixel coordinates """ + H, W = img.shape[-2:] + xgrid, ygrid = coords.split([1, 1], dim=-1) + xgrid = 2 * xgrid / (W - 1) - 1 + ygrid = 2 * ygrid / (H - 1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd, device): + coords = torch.meshgrid( + torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate( + flow, size=new_size, mode=mode, align_corners=True) diff --git a/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py b/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py new file mode 100644 index 000000000..2363092ae --- /dev/null +++ b/modelscope/models/cv/dense_optical_flow_estimation/raft_model.py @@ -0,0 +1,52 @@ +import argparse +import os.path as osp + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.dense_optical_flow_estimation.core.raft import RAFT +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.dense_optical_flow_estimation, + module_name=Models.raft_dense_optical_flow_estimation) +class DenseOpticalFlowEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + args = argparse.Namespace() + args.model = model_dir + args.small = False + args.mixed_precision = False + args.alternate_corr = False + self.model = torch.nn.DataParallel(RAFT(args)) + + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + self.model.load_state_dict(torch.load(model_path)) + self.model = self.model.module + self.model.to('cuda') + self.model.eval() + + def forward(self, Inputs): + image1 = Inputs['image1'] + image2 = Inputs['image2'] + + flow_ups = self.model(image1, image2) + flow_up = flow_ups[-1] + + return flow_up + + def postprocess(self, inputs): + results = {OutputKeys.FLOWS: inputs} + return results + + def inference(self, data): + results = self.forward(data) + return results diff --git a/modelscope/models/cv/face_detection/mogface/models/resnet.py b/modelscope/models/cv/face_detection/mogface/models/resnet.py index 045f6fa37..dc0023c3b 100644 --- a/modelscope/models/cv/face_detection/mogface/models/resnet.py +++ b/modelscope/models/cv/face_detection/mogface/models/resnet.py @@ -1,6 +1,6 @@ -# The implementation is modified from original resent implementaiton, which is -# also open-sourced by the authors as Yang Liu, -# and is available publicly on https://github.com/damo-cv/MogFace +# The implementation is modified from original resent implementation, which is +# also open-sourced by the authors as Yang Liu, +# and is available publicly on https://github.com/damo-cv/MogFace import torch.nn as nn diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py index 11a59302f..545cfb18e 100644 --- 
a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/backbones/master_net.py @@ -27,7 +27,7 @@ def __init__(self, """ Any ReLU-CNN Backbone Args: - plainet_struct: (obj: str): + plainnet_struct: (obj: str): Str of network topology structure. no_reslink: (obj:bool): no use residual structure. diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py index 3bae34d83..cee49276c 100644 --- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/base.py @@ -1,5 +1,5 @@ """ -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +The implementation here is modified based on insightface, originally MIT license and publicly available at https://github.com/deepinsight/insightface/blob/master/detection/scrfd/mmdet/models/detectors/base.py """ from abc import ABCMeta, abstractmethod diff --git a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py index 117eaa82a..9f77f7953 100644 --- a/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py +++ b/modelscope/models/cv/face_detection/scrfd/mmdet_patch/models/detectors/single_stage.py @@ -1,5 +1,5 @@ """ -The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at +The implementation here is modified based on insightface, originally MIT license and publicly available at https://github.com/deepinsight/insightface/blob/master/detection/scrfd/mmdet/models/detectors/single_stage.py """ import torch diff --git a/modelscope/models/cv/face_emotion/efficient/utils.py b/modelscope/models/cv/face_emotion/efficient/utils.py index c1fcd9b3c..e4a79ac65 100644 --- a/modelscope/models/cv/face_emotion/efficient/utils.py +++ b/modelscope/models/cv/face_emotion/efficient/utils.py @@ -207,7 +207,7 @@ def forward(self, x): class Conv2dStaticSamePadding(nn.Conv2d): """2D Convolutions like TensorFlow's 'SAME' mode, with the given input image size. - The padding mudule is calculated in construction function, then used in forward. + The padding module is calculated in construction function, then used in forward. """ def __init__(self, diff --git a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py index cad6cfe00..91d5379a3 100644 --- a/modelscope/models/cv/face_human_hand_detection/ghost_pan.py +++ b/modelscope/models/cv/face_human_hand_detection/ghost_pan.py @@ -186,7 +186,7 @@ class GhostBlocks(nn.Module): out_channels (int): Number of output channels. expand (int): Expand ratio of GhostBottleneck. Default: 1. kernel_size (int): Kernel size of depthwise convolution. Default: 5. - num_blocks (int): Number of GhostBottlecneck blocks. Default: 1. + num_blocks (int): Number of GhostBottleneck blocks. Default: 1. use_res (bool): Whether to use residual connection. Default: False. activation (str): Name of activation function. Default: LeakyReLU. """ @@ -242,7 +242,7 @@ class GhostPAN(nn.Module): blocks. Default: False kernel_size (int): Kernel size of depthwise convolution. Default: 5. expand (int): Expand ratio of GhostBottleneck. Default: 1. 
- num_blocks (int): Number of GhostBottlecneck blocks. Default: 1. + num_blocks (int): Number of GhostBottleneck blocks. Default: 1. use_res (bool): Whether to use residual connection. Default: False. num_extra_level (int): Number of extra conv layers for more feature levels. Default: 0. diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py index afe899632..b8c0eeb52 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/__init__.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone from .model_irse import (IR_18, IR_34, IR_50, IR_101, IR_152, IR_200, IR_SE_50, IR_SE_101, IR_SE_152, IR_SE_200) diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py b/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py index 25b9fe332..ed0f41f8b 100644 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/arcface_backbone.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/deepinsight/insightface/blob/master/recognition/arcface_torch/backbones/iresnet.py import torch from torch import nn diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py index a1683225e..9876bd291 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/common.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/common.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/common.py import torch import torch.nn as nn @@ -7,7 +7,7 @@ def initialize_weights(modules): - """ Weight initilize, conv2d and linear is initialized with kaiming_normal + """ Weight initialize, conv2d and linear is initialized with kaiming_normal """ for m in modules: if isinstance(m, nn.Conv2d): diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py b/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py index c9e01367e..d049ea42e 100644 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/facemask_backbone.py @@ -1,4 +1,4 @@ -# The implementation is adopted from InsightFace, made pubicly available under the Apache-2.0 license at +# The implementation is adopted from InsightFace, made publicly available under the Apache-2.0 license at # https://github.com/TreB1eN/InsightFace_Pytorch/blob/master/model.py from collections import namedtuple diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py index 1982ca059..8e9f5f530 100755 
--- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_irse.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_irse.py from collections import namedtuple diff --git a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py index 568e24ffc..479e7dd4e 100755 --- a/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py +++ b/modelscope/models/cv/face_recognition/torchkit/backbone/model_resnet.py @@ -1,4 +1,4 @@ -# The implementation is adopted from TFace,made pubicly available under the Apache-2.0 license at +# The implementation is adopted from TFace,made publicly available under the Apache-2.0 license at # https://github.com/Tencent/TFace/blob/master/recognition/torchkit/backbone/model_resnet.py import torch.nn as nn from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, diff --git a/modelscope/models/cv/face_reconstruction/models/facerecon_model.py b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py index d753b4163..008e0780f 100644 --- a/modelscope/models/cv/face_reconstruction/models/facerecon_model.py +++ b/modelscope/models/cv/face_reconstruction/models/facerecon_model.py @@ -104,7 +104,7 @@ def __init__(self, zfar=opt.z_far, rasterize_size=int(2 * opt.center)) - self.comupte_color_loss = photo_loss + self.compute_color_loss = photo_loss def set_device(self, device): self.device = device @@ -444,7 +444,7 @@ def forward(self, visualize=False): self.facemodel_front.face_buf, self.bfm_UVs.clone(), pred_color_high) - loss_color_high = self.w_color * self.comupte_color_loss( + loss_color_high = self.w_color * self.compute_color_loss( pred_face_high, self.input_img_for_tex, self.pred_mask.detach()) loss_smooth = TVLoss()(texture_offset) * self.w_tex_smooth diff --git a/modelscope/models/cv/face_reconstruction/models/losses.py b/modelscope/models/cv/face_reconstruction/models/losses.py index 6d4af4e8d..c04a81661 100644 --- a/modelscope/models/cv/face_reconstruction/models/losses.py +++ b/modelscope/models/cv/face_reconstruction/models/losses.py @@ -49,7 +49,7 @@ def perceptual_loss(id_featureA, id_featureB): # image level loss def photo_loss(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA @@ -170,7 +170,7 @@ def _tensor_size(self, t): def photo_loss_sum(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA diff --git a/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py b/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py index c18881edc..5a8a4709e 100644 --- a/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py +++ 
b/modelscope/models/cv/face_reconstruction/models/pix2pix/networks.py @@ -322,7 +322,7 @@ def get_target_tensor(self, prediction, target_is_real): """Create label tensors with the same size as the input. Parameters: - prediction (tensor) - - tpyically the prediction from a discriminator + prediction (tensor) - - typically the prediction from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: @@ -336,10 +336,10 @@ def get_target_tensor(self, prediction, target_is_real): return target_tensor.expand_as(prediction) def __call__(self, prediction, target_is_real): - """Calculate loss given Discriminator's output and grount truth labels. + """Calculate loss given Discriminator's output and ground truth labels. Parameters: - prediction (tensor) - - tpyically the prediction output from a discriminator + prediction (tensor) - - typically the prediction output from a discriminator target_is_real (bool) - - if the ground truth label is for real images or fake images Returns: diff --git a/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py b/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py index 54768fc1c..b9c2c9000 100644 --- a/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py +++ b/modelscope/models/cv/face_reconstruction/models/pix2pix/pix2pix_model.py @@ -13,7 +13,7 @@ class Pix2PixModel(nn.Module): The model training requires '--dataset_mode aligned' dataset. By default, it uses a '--netG unet256' U-Net generator, a '--netD basic' discriminator (PatchGAN), - and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the orignal GAN paper). + and a '--gan_mode' vanilla GAN loss (the cross-entropy objective used in the original GAN paper). 
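For context on the GANLoss docstrings corrected above: with '--gan_mode vanilla', the loss is binary cross-entropy between the discriminator prediction and a constant label tensor expanded to the prediction's shape. The sketch below illustrates that pattern in isolation; it is a hedged stand-in, not the pix2pix GANLoss class itself.

import torch
import torch.nn.functional as F

def vanilla_gan_loss(prediction: torch.Tensor, target_is_real: bool) -> torch.Tensor:
    # Label tensor: 1.0 for real, 0.0 for fake, broadcast to the prediction's
    # shape, then the cross-entropy objective from the original GAN paper
    # (BCE on logits).
    target = torch.full_like(prediction, 1.0 if target_is_real else 0.0)
    return F.binary_cross_entropy_with_logits(prediction, target)

# e.g. loss_G_GAN = vanilla_gan_loss(netD(fake_pair), target_is_real=True)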
pix2pix paper: https://arxiv.org/pdf/1611.07004.pdf """ @@ -121,5 +121,5 @@ def optimize_parameters(self): self.set_requires_grad( self.netD, False) # D requires no gradients when optimizing G self.optimizer_G.zero_grad() # set G's gradients to zero - self.backward_G() # calculate graidents for G - self.optimizer_G.step() # udpate G's weights + self.backward_G() # calculate gradients for G + self.optimizer_G.step() # update G's weights diff --git a/modelscope/models/cv/face_reconstruction/models/renderer.py b/modelscope/models/cv/face_reconstruction/models/renderer.py index d10fd5604..bfe166b0c 100755 --- a/modelscope/models/cv/face_reconstruction/models/renderer.py +++ b/modelscope/models/cv/face_reconstruction/models/renderer.py @@ -20,7 +20,7 @@ def set_rasterizer(): class Pytorch3dRasterizer(nn.Module): - # TODO: add support for rendering non-squared images, since pytorc3d supports this now + # TODO: add support for rendering non-squared images, since pytorch3d supports this now """ Borrowed from https://github.com/facebookresearch/pytorch3d Notice: x,y,z are in image space, normalized @@ -158,7 +158,7 @@ def forward(self, -- Texture Rendering vertices: [batch_size, V, 3], vertices in world space, for calculating normals, then shading transformed_vertices: [batch_size, V, 3], range:normalized to [-1,1], projected vertices in image space - (that is aligned to the iamge pixel), for rasterization + (that is aligned to the image pixel), for rasterization albedos: [batch_size, 3, h, w], uv map lights: spherical homarnic: [N, 9(shcoeff), 3(rgb)] diff --git a/modelscope/models/cv/face_reconstruction/utils.py b/modelscope/models/cv/face_reconstruction/utils.py index 655d8b2a7..f23b2f707 100644 --- a/modelscope/models/cv/face_reconstruction/utils.py +++ b/modelscope/models/cv/face_reconstruction/utils.py @@ -767,6 +767,7 @@ def align_img(img, lm, lm3D, mask=None, target_size=224., rescale_factor=102.): # calculate translation and scale factors using 5 facial landmarks and standard landmarks of a 3D face t, s = POS(lm5p.transpose(), lm3D.transpose()) + t = t.squeeze() s = rescale_factor / s # processing the image diff --git a/modelscope/models/cv/facial_68ldk_detection/__init__.py b/modelscope/models/cv/facial_68ldk_detection/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py b/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py new file mode 100644 index 000000000..4690762b4 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/__init__.py @@ -0,0 +1 @@ +from .alignment import Alignment diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py b/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py new file mode 100644 index 000000000..30b5773d7 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/alignment.py @@ -0,0 +1,353 @@ +import os.path as osp + +from .base import Base + + +class Alignment(Base): + """ + Alignment configure file, which contains training parameters of alignment. 
+ """ + + def __init__(self, args): + super(Alignment, self).__init__('alignment') + self.ckpt_dir = '/mnt/workspace/humanAIGC/project/STAR/weights' + self.net = 'stackedHGnet_v1' + self.nstack = 4 + self.loader_type = 'alignment' + self.data_definition = '300W' # COFW, 300W, WFLW + self.test_file = 'test.tsv' + + # image + self.channels = 3 + self.width = 256 + self.height = 256 + self.means = (127.5, 127.5, 127.5) + self.scale = 1 / 127.5 + self.aug_prob = 1.0 + + self.display_iteration = 10 + self.val_epoch = 1 + self.valset = 'test.tsv' + self.norm_type = 'default' + self.encoder_type = 'default' + self.decoder_type = 'default' + + # scheduler & optimizer + self.milestones = [200, 350, 450] + self.max_epoch = 260 + self.optimizer = 'adam' + self.learn_rate = 0.001 + self.weight_decay = 0.00001 + self.betas = [0.9, 0.999] + self.gamma = 0.1 + + # batch_size & workers + self.batch_size = 32 + self.train_num_workers = 16 + self.val_batch_size = 32 + self.val_num_workers = 16 + self.test_batch_size = 16 + self.test_num_workers = 0 + + # tricks + self.ema = True + self.add_coord = True + self.use_AAM = True + + # loss + self.loss_func = 'STARLoss_v2' + + # STAR Loss paras + self.star_w = 1 + self.star_dist = 'smoothl1' + + self.init_from_args(args) + + # COFW + if self.data_definition == 'COFW': + self.edge_info = ( + (True, (0, 4, 2, 5)), # RightEyebrow + (True, (1, 6, 3, 7)), # LeftEyebrow + (True, (8, 12, 10, 13)), # RightEye + (False, (9, 14, 11, 15)), # LeftEye + (True, (18, 20, 19, 21)), # Nose + (True, (22, 26, 23, 27)), # LowerLip + (True, (22, 24, 23, 25)), # UpperLip + ) + if self.norm_type == 'ocular': + self.nme_left_index = 8 # ocular + self.nme_right_index = 9 # ocular + elif self.norm_type in ['pupil', 'default']: + self.nme_left_index = 16 # pupil + self.nme_right_index = 17 # pupil + else: + raise NotImplementedError + self.classes_num = [29, 7, 29] + self.crop_op = True + self.flip_mapping = ( + [0, 1], + [4, 6], + [2, 3], + [5, 7], + [8, 9], + [10, 11], + [12, 14], + [16, 17], + [13, 15], + [18, 19], + [22, 23], + ) + self.image_dir = osp.join(self.image_dir, 'COFW') + # 300W + elif self.data_definition == '300W': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16)), # FaceContour + (False, (17, 18, 19, 20, 21)), # RightEyebrow + (False, (22, 23, 24, 25, 26)), # LeftEyebrow + (False, (27, 28, 29, 30)), # NoseLine + (False, (31, 32, 33, 34, 35)), # Nose + (True, (36, 37, 38, 39, 40, 41)), # RightEye + (True, (42, 43, 44, 45, 46, 47)), # LeftEye + (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59)), # OuterLip + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 36 # ocular + self.nme_right_index = 45 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil + self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil + else: + raise NotImplementedError + self.classes_num = [68, 9, 68] + self.crop_op = True + self.flip_mapping = ( + [0, 16], + [1, 15], + [2, 14], + [3, 13], + [4, 12], + [5, 11], + [6, 10], + [7, 9], + [17, 26], + [18, 25], + [19, 24], + [20, 23], + [21, 22], + [31, 35], + [32, 34], + [36, 45], + [37, 44], + [38, 43], + [39, 42], + [40, 47], + [41, 46], + [48, 54], + [49, 53], + [50, 52], + [61, 63], + [60, 64], + [67, 65], + [58, 56], + [59, 55], + ) + self.image_dir = osp.join(self.image_dir, '300W') + # self.image_dir = osp.join(self.image_dir, '300VW_images') + # 300VW + elif 
self.data_definition == '300VW': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16)), # FaceContour + (False, (17, 18, 19, 20, 21)), # RightEyebrow + (False, (22, 23, 24, 25, 26)), # LeftEyebrow + (False, (27, 28, 29, 30)), # NoseLine + (False, (31, 32, 33, 34, 35)), # Nose + (True, (36, 37, 38, 39, 40, 41)), # RightEye + (True, (42, 43, 44, 45, 46, 47)), # LeftEye + (True, (48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, + 59)), # OuterLip + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 36 # ocular + self.nme_right_index = 45 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = [36, 37, 38, 39, 40, 41] # pupil + self.nme_right_index = [42, 43, 44, 45, 46, 47] # pupil + else: + raise NotImplementedError + self.classes_num = [68, 9, 68] + self.crop_op = True + self.flip_mapping = ( + [0, 16], + [1, 15], + [2, 14], + [3, 13], + [4, 12], + [5, 11], + [6, 10], + [7, 9], + [17, 26], + [18, 25], + [19, 24], + [20, 23], + [21, 22], + [31, 35], + [32, 34], + [36, 45], + [37, 44], + [38, 43], + [39, 42], + [40, 47], + [41, 46], + [48, 54], + [49, 53], + [50, 52], + [61, 63], + [60, 64], + [67, 65], + [58, 56], + [59, 55], + ) + self.image_dir = osp.join(self.image_dir, + '300VW_Dataset_2015_12_14') + # WFLW + elif self.data_definition == 'WFLW': + self.edge_info = ( + (False, (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31, 32)), # FaceContour + (True, (33, 34, 35, 36, 37, 38, 39, 40, 41)), # RightEyebrow + (True, (42, 43, 44, 45, 46, 47, 48, 49, 50)), # LeftEyebrow + (False, (51, 52, 53, 54)), # NoseLine + (False, (55, 56, 57, 58, 59)), # Nose + (True, (60, 61, 62, 63, 64, 65, 66, 67)), # RightEye + (True, (68, 69, 70, 71, 72, 73, 74, 75)), # LeftEye + (True, (76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, + 87)), # OuterLip + (True, (88, 89, 90, 91, 92, 93, 94, 95)), # InnerLip + ) + if self.norm_type in ['ocular', 'default']: + self.nme_left_index = 60 # ocular + self.nme_right_index = 72 # ocular + elif self.norm_type == 'pupil': + self.nme_left_index = 96 # pupils + self.nme_right_index = 97 # pupils + else: + raise NotImplementedError + self.classes_num = [98, 9, 98] + self.crop_op = True + self.flip_mapping = ( + [0, 32], + [1, 31], + [2, 30], + [3, 29], + [4, 28], + [5, 27], + [6, 26], + [7, 25], + [8, 24], + [9, 23], + [10, 22], + [11, 21], + [12, 20], + [13, 19], + [14, 18], + [15, 17], # cheek + [33, 46], + [34, 45], + [35, 44], + [36, 43], + [37, 42], + [38, 50], + [39, 49], + [40, 48], + [41, 47], # elbrow + [60, 72], + [61, 71], + [62, 70], + [63, 69], + [64, 68], + [65, 75], + [66, 74], + [67, 73], + [55, 59], + [56, 58], + [76, 82], + [77, 81], + [78, 80], + [87, 83], + [86, 84], + [88, 92], + [89, 91], + [95, 93], + [96, 97]) + self.image_dir = osp.join(self.image_dir, 'WFLW', 'WFLW_images') + + self.label_num = self.nstack * 3 if self.use_AAM else self.nstack + self.loss_weights, self.criterions, self.metrics = [], [], [] + for i in range(self.nstack): + factor = (2**i) / (2**(self.nstack - 1)) + if self.use_AAM: + self.loss_weights += [ + factor * weight for weight in [1.0, 10.0, 10.0] + ] + self.criterions += [self.loss_func, 'AWingLoss', 'AWingLoss'] + self.metrics += ['NME', None, None] + else: + self.loss_weights += [factor * weight for weight in [1.0]] + self.criterions += [ + self.loss_func, + ] + self.metrics += [ + 'NME', + ] + + self.key_metric_index = (self.nstack - 1) * 
3 if self.use_AAM else ( + self.nstack - 1) + + # data + self.folder = self.get_foldername() + self.work_dir = osp.join(self.ckpt_dir, self.data_definition, + self.folder) + self.model_dir = osp.join(self.work_dir, 'model') + self.log_dir = osp.join(self.work_dir, 'log') + + self.train_tsv_file = osp.join(self.annot_dir, self.data_definition, + 'train.tsv') + self.train_pic_dir = self.image_dir + + self.val_tsv_file = osp.join(self.annot_dir, self.data_definition, + self.valset) + self.val_pic_dir = self.image_dir + + self.test_tsv_file = osp.join(self.annot_dir, self.data_definition, + self.test_file) + self.test_pic_dir = self.image_dir + + # self.train_tsv_file = osp.join(self.annot_dir, '300VW', "train.tsv") + # self.train_pic_dir = self.image_dir + + # self.val_tsv_file = osp.join(self.annot_dir, '300VW', self.valset) + # self.val_pic_dir = self.image_dir + + # self.test_tsv_file = osp.join(self.annot_dir, '300VW', self.test_file) + # self.test_pic_dir = self.image_dir + + def get_foldername(self): + str = '' + str += '{}_{}x{}_{}_ep{}_lr{}_bs{}'.format( + self.data_definition, self.height, self.width, self.optimizer, + self.max_epoch, self.learn_rate, self.batch_size) + str += '_{}'.format(self.loss_func) + str += '_{}_{}'.format( + self.star_dist, + self.star_w) if self.loss_func == 'STARLoss' else '' + str += '_AAM' if self.use_AAM else '' + str += '_{}'.format( + self.valset[:-4]) if self.valset != 'test.tsv' else '' + str += '_{}'.format(self.id) + return str diff --git a/modelscope/models/cv/facial_68ldk_detection/conf/base.py b/modelscope/models/cv/facial_68ldk_detection/conf/base.py new file mode 100644 index 000000000..304505241 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/conf/base.py @@ -0,0 +1,102 @@ +import logging +import os.path as osp +import uuid +from argparse import Namespace + +# from tensorboardX import SummaryWriter + + +class Base: + """ + Base configure file, which contains the basic training parameters + and should be inherited by other attribute configure file. + """ + + def __init__(self, + config_name, + ckpt_dir='./', + image_dir='./', + annot_dir='./'): + self.type = config_name + self.id = str(uuid.uuid4()) + self.note = '' + + self.ckpt_dir = ckpt_dir + self.image_dir = image_dir + self.annot_dir = annot_dir + + self.loader_type = 'alignment' + self.loss_func = 'STARLoss' + + # train + self.batch_size = 128 + self.val_batch_size = 1 + self.test_batch_size = 32 + self.channels = 3 + self.width = 256 + self.height = 256 + + # mean values in r, g, b channel. 
+ self.means = (127, 127, 127) + self.scale = 0.0078125 + + self.display_iteration = 100 + self.milestones = [50, 80] + self.max_epoch = 100 + + self.net = 'stackedHGnet_v1' + self.nstack = 4 + + # ["adam", "sgd"] + self.optimizer = 'adam' + self.learn_rate = 0.1 + self.momentum = 0.01 # caffe: 0.99 + self.weight_decay = 0.0 + self.nesterov = False + self.scheduler = 'MultiStepLR' + self.gamma = 0.1 + + self.loss_weights = [1.0] + self.criterions = ['SoftmaxWithLoss'] + self.metrics = ['Accuracy'] + self.key_metric_index = 0 + self.classes_num = [1000] + self.label_num = len(self.classes_num) + + # model + self.ema = False + self.use_AAM = True + + # visualization + self.writer = None + + # log file + self.logger = None + + def init_instance(self): + # self.writer = SummaryWriter(logdir=self.log_dir, comment=self.type) + log_formatter = logging.Formatter( + '%(asctime)s %(levelname)-8s: %(message)s') + root_logger = logging.getLogger() + file_handler = logging.FileHandler(osp.join(self.log_dir, 'log.txt')) + file_handler.setFormatter(log_formatter) + file_handler.setLevel(logging.NOTSET) + root_logger.addHandler(file_handler) + console_handler = logging.StreamHandler() + console_handler.setFormatter(log_formatter) + console_handler.setLevel(logging.NOTSET) + root_logger.addHandler(console_handler) + root_logger.setLevel(logging.NOTSET) + self.logger = root_logger + + def __del__(self): + # tensorboard --logdir self.log_dir + if self.writer is not None: + # self.writer.export_scalars_to_json(self.log_dir + "visual.json") + self.writer.close() + + def init_from_args(self, args: Namespace): + args_vars = vars(args) + for key, value in args_vars.items(): + if hasattr(self, key) and value is not None: + setattr(self, key, value) diff --git a/modelscope/models/cv/facial_68ldk_detection/infer.py b/modelscope/models/cv/facial_68ldk_detection/infer.py new file mode 100644 index 000000000..ccc6229a0 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/infer.py @@ -0,0 +1,204 @@ +import argparse +import copy +import math + +import cv2 +import numpy as np +import torch + +# private package +from .lib import utility + + +class GetCropMatrix(): + """ + from_shape -> transform_matrix + """ + + def __init__(self, image_size, target_face_scale, align_corners=False): + self.image_size = image_size + self.target_face_scale = target_face_scale + self.align_corners = align_corners + + def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, + to_center): + cosv = math.cos(angle) + sinv = math.sin(angle) + + fx, fy = from_center + tx, ty = to_center + + acos = scale * cosv + asin = scale * sinv + + a0 = acos + a1 = -asin + a2 = tx - acos * fx + asin * fy + shift_xy[0] + + b0 = asin + b1 = acos + b2 = ty - asin * fx - acos * fy + shift_xy[1] + + rot_scale_m = np.array([[a0, a1, a2], [b0, b1, b2], [0.0, 0.0, 1.0]], + np.float32) + return rot_scale_m + + def process(self, scale, center_w, center_h): + if self.align_corners: + to_w, to_h = self.image_size - 1, self.image_size - 1 + else: + to_w, to_h = self.image_size, self.image_size + + rot_mu = 0 + scale_mu = self.image_size / (scale * self.target_face_scale * 200.0) + shift_xy_mu = (0, 0) + matrix = self._compose_rotate_and_scale( + rot_mu, + scale_mu, + shift_xy_mu, + from_center=[center_w, center_h], + to_center=[to_w / 2.0, to_h / 2.0]) + return matrix + + +class TransformPerspective(): + """ + image, matrix3x3 -> transformed_image + """ + + def __init__(self, image_size): + self.image_size = image_size + + def process(self, image, 
matrix): + return cv2.warpPerspective( + image, + matrix, + dsize=(self.image_size, self.image_size), + flags=cv2.INTER_LINEAR, + borderValue=0) + + +class TransformPoints2D(): + """ + points (nx2), matrix (3x3) -> points (nx2) + """ + + def process(self, srcPoints, matrix): + # nx3 + desPoints = np.concatenate( + [srcPoints, np.ones_like(srcPoints[:, [0]])], axis=1) + desPoints = desPoints @ np.transpose(matrix) # nx3 + desPoints = desPoints[:, :2] / desPoints[:, [2, 2]] + return desPoints.astype(srcPoints.dtype) + + +class Alignment: + + def __init__(self, args, model_path, dl_framework, device_ids): + self.input_size = 256 + self.target_face_scale = 1.0 + self.dl_framework = dl_framework + + # model + if self.dl_framework == 'pytorch': + # conf + self.config = utility.get_config(args) + self.config.device_id = device_ids[0] + + # set environment + utility.set_environment(self.config) + + net = utility.get_net(self.config) + if device_ids == [-1]: + checkpoint = torch.load(model_path, map_location='cpu') + else: + checkpoint = torch.load(model_path) + net.load_state_dict(checkpoint['net']) + + if self.config.device_id == -1: + net = net.cpu() + else: + net = net.to(self.config.device_id) + + net.eval() + self.alignment = net + else: + assert False + + self.getCropMatrix = GetCropMatrix( + image_size=self.input_size, + target_face_scale=self.target_face_scale, + align_corners=True) + self.transformPerspective = TransformPerspective( + image_size=self.input_size) + self.transformPoints2D = TransformPoints2D() + + def norm_points(self, points, align_corners=False): + if align_corners: + # [0, SIZE-1] -> [-1, +1] + return points / torch.tensor([ + self.input_size - 1, self.input_size - 1 + ]).to(points).view(1, 1, 2) * 2 - 1 + else: + # [-0.5, SIZE-0.5] -> [-1, +1] + return (points * 2 + 1) / torch.tensor([ + self.input_size, self.input_size + ]).to(points).view(1, 1, 2) - 1 + + def denorm_points(self, points, align_corners=False): + if align_corners: + # [-1, +1] -> [0, SIZE-1] + return (points + 1) / 2 * torch.tensor([ + self.input_size - 1, self.input_size - 1 + ]).to(points).view(1, 1, 2) + else: + # [-1, +1] -> [-0.5, SIZE-0.5] + return ((points + 1) * torch.tensor( # noqa + [self.input_size, self.input_size]).to(points).view(1, 1, + 2) # noqa + - 1) / 2 # noqa + + def preprocess(self, image, scale, center_w, center_h): + matrix = self.getCropMatrix.process(scale, center_w, center_h) + input_tensor = self.transformPerspective.process(image, matrix) + input_tensor = input_tensor[np.newaxis, :] + + input_tensor = torch.from_numpy(input_tensor) + input_tensor = input_tensor.float().permute(0, 3, 1, 2) + input_tensor = input_tensor / 255.0 * 2.0 - 1.0 + + if self.config.device_id == -1: + input_tensor = input_tensor.cpu() + else: + input_tensor = input_tensor.to(self.config.device_id) + + return input_tensor, matrix + + def postprocess(self, srcPoints, coeff): + # dstPoints = self.transformPoints2D.process(srcPoints, coeff) + # matrix^(-1) * src = dst + # src = matrix * dst + dstPoints = np.zeros(srcPoints.shape, dtype=np.float32) + for i in range(srcPoints.shape[0]): + dstPoints[i][0] = coeff[0][0] * srcPoints[i][0] + coeff[0][ + 1] * srcPoints[i][1] + coeff[0][2] + dstPoints[i][1] = coeff[1][0] * srcPoints[i][0] + coeff[1][ + 1] * srcPoints[i][1] + coeff[1][2] + return dstPoints + + def analyze(self, image, scale, center_w, center_h): + input_tensor, matrix = self.preprocess(image, scale, center_w, + center_h) + + if self.dl_framework == 'pytorch': + with torch.no_grad(): + output = 
self.alignment(input_tensor) + landmarks = output[-1][0] + else: + assert False + + landmarks = self.denorm_points(landmarks) + landmarks = landmarks.data.cpu().numpy()[0] + landmarks = self.postprocess(landmarks, np.linalg.inv(matrix)) + + return landmarks diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py new file mode 100644 index 000000000..a0efc10d8 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/__init__.py @@ -0,0 +1,2 @@ +from .backbone import StackedHGNetV1 +from .utility import get_config, get_net diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py new file mode 100644 index 000000000..5bbfc2e2c --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/__init__.py @@ -0,0 +1,5 @@ +from .stackedHGNetV1 import StackedHGNetV1 + +__all__ = [ + 'StackedHGNetV1', +] diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py new file mode 100644 index 000000000..ca37ea557 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/core/coord_conv.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn + + +class AddCoordsTh(nn.Module): + + def __init__(self, x_dim, y_dim, with_r=False, with_boundary=False): + super(AddCoordsTh, self).__init__() + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + self.with_boundary = with_boundary + + def forward(self, input_tensor, heatmap=None): + """ + input_tensor: (batch, c, x_dim, y_dim) + """ + batch_size_tensor = input_tensor.shape[0] + + xx_ones = torch.ones([1, self.y_dim], + dtype=torch.int32).to(input_tensor) + xx_ones = xx_ones.unsqueeze(-1) + + xx_range = torch.arange( + self.x_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor) + xx_range = xx_range.unsqueeze(1) + + xx_channel = torch.matmul(xx_ones.float(), xx_range.float()) + xx_channel = xx_channel.unsqueeze(-1) + + yy_ones = torch.ones([1, self.x_dim], + dtype=torch.int32).to(input_tensor) + yy_ones = yy_ones.unsqueeze(1) + + yy_range = torch.arange( + self.y_dim, dtype=torch.int32).unsqueeze(0).to(input_tensor) + yy_range = yy_range.unsqueeze(-1) + + yy_channel = torch.matmul(yy_range.float(), yy_ones.float()) + yy_channel = yy_channel.unsqueeze(-1) + + xx_channel = xx_channel.permute(0, 3, 2, 1) + yy_channel = yy_channel.permute(0, 3, 2, 1) + + xx_channel = xx_channel / (self.x_dim - 1) + yy_channel = yy_channel / (self.y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size_tensor, 1, 1, 1) + yy_channel = yy_channel.repeat(batch_size_tensor, 1, 1, 1) + + if self.with_boundary and heatmap is not None: + boundary_channel = torch.clamp(heatmap[:, -1:, :, :], 0.0, 1.0) + + zero_tensor = torch.zeros_like(xx_channel).to(xx_channel) + xx_boundary_channel = torch.where(boundary_channel > 0.05, + xx_channel, zero_tensor) + yy_boundary_channel = torch.where(boundary_channel > 0.05, + yy_channel, zero_tensor) + ret = torch.cat([input_tensor, xx_channel, yy_channel], dim=1) + + if self.with_r: + rr = torch.sqrt( + torch.pow(xx_channel, 2) + torch.pow(yy_channel, 2)) + rr = rr / torch.max(rr) + ret = torch.cat([ret, rr], dim=1) + + if self.with_boundary and heatmap is not None: + ret = torch.cat([ret, xx_boundary_channel, yy_boundary_channel], + dim=1) + return ret 
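AddCoordsTh.forward above implements the CoordConv idea: two extra channels holding each pixel's x/y coordinates, rescaled to [-1, +1], are concatenated with the input (plus optional radius and boundary-masked variants) before the convolution. A stripped-down sketch of just the core step, assuming the radius and boundary options are disabled:

import torch

def add_coord_channels(x: torch.Tensor) -> torch.Tensor:
    # x: (batch, channels, h, w) -> (batch, channels + 2, h, w)
    b, _, h, w = x.shape
    ys = torch.linspace(-1.0, 1.0, h, device=x.device).view(1, 1, h, 1).expand(b, 1, h, w)
    xs = torch.linspace(-1.0, 1.0, w, device=x.device).view(1, 1, 1, w).expand(b, 1, h, w)
    return torch.cat([x, xs, ys], dim=1)

# The convolution that follows must accept two extra input channels, which is
# exactly the in_channels adjustment CoordConvTh makes in its __init__.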
+ + +class CoordConvTh(nn.Module): + """CoordConv layer as in the paper.""" + + def __init__(self, + x_dim, + y_dim, + with_r, + with_boundary, + in_channels, + out_channels, + first_one=False, + relu=False, + bn=False, + *args, + **kwargs): + super(CoordConvTh, self).__init__() + self.addcoords = AddCoordsTh( + x_dim=x_dim, + y_dim=y_dim, + with_r=with_r, + with_boundary=with_boundary) + in_channels += 2 + if with_r: + in_channels += 1 + if with_boundary and not first_one: + in_channels += 2 + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + *args, + **kwargs) + self.relu = nn.ReLU() if relu else None + self.bn = nn.BatchNorm2d(out_channels) if bn else None + + self.with_boundary = with_boundary + self.first_one = first_one + + def forward(self, input_tensor, heatmap=None): + assert (self.with_boundary and not self.first_one) == ( + heatmap is not None) + ret = self.addcoords(input_tensor, heatmap) + ret = self.conv(ret) + if self.bn is not None: + ret = self.bn(ret) + if self.relu is not None: + ret = self.relu(ret) + + return ret + + +''' +An alternative implementation for PyTorch with auto-infering the x-y dimensions. +''' + + +class AddCoords(nn.Module): + + def __init__(self, with_r=False): + super().__init__() + self.with_r = with_r + + def forward(self, input_tensor): + """ + Args: + input_tensor: shape(batch, channel, x_dim, y_dim) + """ + batch_size, _, x_dim, y_dim = input_tensor.size() + + xx_channel = torch.arange(x_dim).repeat(1, y_dim, 1).to(input_tensor) + yy_channel = torch.arange(y_dim).repeat(1, x_dim, 1).transpose( + 1, 2).to(input_tensor) + + xx_channel = xx_channel / (x_dim - 1) + yy_channel = yy_channel / (y_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + xx_channel = xx_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) + yy_channel = yy_channel.repeat(batch_size, 1, 1, 1).transpose(2, 3) + + ret = torch.cat( + [ # noqa + input_tensor, # noqa + xx_channel.type_as(input_tensor), # noqa + yy_channel.type_as(input_tensor) # noqa + ], # noqa + dim=1) # noqa + + if self.with_r: + rr = torch.sqrt( + torch.pow(xx_channel - 0.5, 2) + + torch.pow(yy_channel - 0.5, 2)) + ret = torch.cat([ret, rr], dim=1) + + return ret + + +class CoordConv(nn.Module): + + def __init__(self, in_channels, out_channels, with_r=False, **kwargs): + super().__init__() + self.addcoords = AddCoords(with_r=with_r) + in_channels += 2 + if with_r: + in_channels += 1 + self.conv = nn.Conv2d(in_channels, out_channels, **kwargs) + + def forward(self, x): + ret = self.addcoords(x) + ret = self.conv(ret) + return ret diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py new file mode 100644 index 000000000..f330cc034 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/backbone/stackedHGNetV1.py @@ -0,0 +1,374 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..dataset import get_decoder +from .core.coord_conv import CoordConvTh + + +class Activation(nn.Module): + + def __init__(self, kind: str = 'relu', channel=None): + super().__init__() + self.kind = kind + + if '+' in kind: + norm_str, act_str = kind.split('+') + else: + norm_str, act_str = 'none', kind + + self.norm_fn = { + 'in': + F.instance_norm, + 'bn': + nn.BatchNorm2d(channel), + 'bn_noaffine': + nn.BatchNorm2d(channel, affine=False, track_running_stats=True), + 'none': + None + }[norm_str] + + self.act_fn 
= { + 'relu': F.relu, + 'softplus': nn.Softplus(), + 'exp': torch.exp, + 'sigmoid': torch.sigmoid, + 'tanh': torch.tanh, + 'none': None + }[act_str] + + self.channel = channel + + def forward(self, x): + if self.norm_fn is not None: + x = self.norm_fn(x) + if self.act_fn is not None: + x = self.act_fn(x) + return x + + def extra_repr(self): + return f'kind={self.kind}, channel={self.channel}' + + +class ConvBlock(nn.Module): + + def __init__(self, + inp_dim, + out_dim, + kernel_size=3, + stride=1, + bn=False, + relu=True, + groups=1): + super(ConvBlock, self).__init__() + self.inp_dim = inp_dim + self.conv = nn.Conv2d( + inp_dim, + out_dim, + kernel_size, + stride, + padding=(kernel_size - 1) // 2, + groups=groups, + bias=True) + self.relu = None + self.bn = None + if relu: + self.relu = nn.ReLU() + if bn: + self.bn = nn.BatchNorm2d(out_dim) + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu is not None: + x = self.relu(x) + return x + + +class ResBlock(nn.Module): + + def __init__(self, inp_dim, out_dim, mid_dim=None): + super(ResBlock, self).__init__() + if mid_dim is None: + mid_dim = out_dim // 2 + self.relu = nn.ReLU() + self.bn1 = nn.BatchNorm2d(inp_dim) + self.conv1 = ConvBlock(inp_dim, mid_dim, 1, relu=False) + self.bn2 = nn.BatchNorm2d(mid_dim) + self.conv2 = ConvBlock(mid_dim, mid_dim, 3, relu=False) + self.bn3 = nn.BatchNorm2d(mid_dim) + self.conv3 = ConvBlock(mid_dim, out_dim, 1, relu=False) + self.skip_layer = ConvBlock(inp_dim, out_dim, 1, relu=False) + if inp_dim == out_dim: + self.need_skip = False + else: + self.need_skip = True + + def forward(self, x): + if self.need_skip: + residual = self.skip_layer(x) + else: + residual = x + out = self.bn1(x) + out = self.relu(out) + out = self.conv1(out) + out = self.bn2(out) + out = self.relu(out) + out = self.conv2(out) + out = self.bn3(out) + out = self.relu(out) + out = self.conv3(out) + out += residual + return out + + +class Hourglass(nn.Module): + + def __init__(self, + n, + f, + increase=0, + up_mode='nearest', + add_coord=False, + first_one=False, + x_dim=64, + y_dim=64): + super(Hourglass, self).__init__() + nf = f + increase + + Block = ResBlock + + if add_coord: + self.coordconv = CoordConvTh( + x_dim=x_dim, + y_dim=y_dim, + with_r=True, + with_boundary=True, + relu=False, + bn=False, + in_channels=f, + out_channels=f, + first_one=first_one, + kernel_size=1, + stride=1, + padding=0) + else: + self.coordconv = None + self.up1 = Block(f, f) + + # Lower branch + self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2) + + self.low1 = Block(f, nf) + self.n = n + # Recursive hourglass + if self.n > 1: + self.low2 = Hourglass( + n=n - 1, + f=nf, + increase=increase, + up_mode=up_mode, + add_coord=False) + else: + self.low2 = Block(nf, nf) + self.low3 = Block(nf, f) + self.up2 = nn.Upsample(scale_factor=2, mode=up_mode) + + def forward(self, x, heatmap=None): + if self.coordconv is not None: + x = self.coordconv(x, heatmap) + up1 = self.up1(x) + pool1 = self.pool1(x) + low1 = self.low1(pool1) + low2 = self.low2(low1) + low3 = self.low3(low2) + up2 = self.up2(low3) + return up1 + up2 + + +class E2HTransform(nn.Module): + + def __init__(self, edge_info, num_points, num_edges): + super().__init__() + + e2h_matrix = np.zeros([num_points, num_edges]) + for edge_id, isclosed_indices in enumerate(edge_info): + is_closed, indices = isclosed_indices + for point_id in indices: + e2h_matrix[point_id, edge_id] = 1 + e2h_matrix = torch.from_numpy(e2h_matrix).float() + + # pn x en x 1 x 1. 
+ self.register_buffer( + 'weight', + e2h_matrix.view(e2h_matrix.size(0), e2h_matrix.size(1), 1, 1)) + + # some keypoints are not coverred by any edges, + # in these cases, we must add a constant bias to their heatmap weights. + bias = ((e2h_matrix @ torch.ones(e2h_matrix.size(1)).to(e2h_matrix)) + < # noqa + 0.5).to(e2h_matrix) # noqa + # pn x 1. + self.register_buffer('bias', bias) + + def forward(self, edgemaps): + # input: batch_size x en x hw x hh. + # output: batch_size x pn x hw x hh. + return F.conv2d(edgemaps, weight=self.weight, bias=self.bias) + + +class StackedHGNetV1(nn.Module): + + def __init__(self, + config, + classes_num, + edge_info, + nstack=4, + nlevels=4, + in_channel=256, + increase=0, + add_coord=True, + decoder_type='default'): + super(StackedHGNetV1, self).__init__() + + self.cfg = config + self.coder_type = decoder_type + self.decoder = get_decoder(decoder_type=decoder_type) + self.nstack = nstack + self.add_coord = add_coord + + self.num_heats = classes_num[0] + + if self.add_coord: + convBlock = CoordConvTh( + x_dim=self.cfg.width, + y_dim=self.cfg.height, + with_r=True, + with_boundary=False, + relu=True, + bn=True, + in_channels=3, + out_channels=64, + kernel_size=7, + stride=2, + padding=3) + else: + convBlock = ConvBlock(3, 64, 7, 2, bn=True, relu=True) + + pool = nn.MaxPool2d(kernel_size=2, stride=2) + + Block = ResBlock + + self.pre = nn.Sequential(convBlock, Block(64, 128), pool, + Block(128, 128), Block(128, in_channel)) + + self.hgs = nn.ModuleList([ + Hourglass( + n=nlevels, + f=in_channel, + increase=increase, + add_coord=self.add_coord, + first_one=(_ == 0), + x_dim=int(self.cfg.width / self.nstack), + y_dim=int(self.cfg.height / self.nstack)) + for _ in range(nstack) + ]) + + self.features = nn.ModuleList([ + nn.Sequential( + Block(in_channel, in_channel), + ConvBlock(in_channel, in_channel, 1, bn=True, relu=True)) + for _ in range(nstack) + ]) + + self.out_heatmaps = nn.ModuleList([ + ConvBlock(in_channel, self.num_heats, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + + if self.cfg.use_AAM: + self.num_edges = classes_num[1] + self.num_points = classes_num[2] + + self.e2h_transform = E2HTransform(edge_info, self.num_points, + self.num_edges) + self.out_edgemaps = nn.ModuleList([ + ConvBlock(in_channel, self.num_edges, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + self.out_pointmaps = nn.ModuleList([ + ConvBlock( + in_channel, self.num_points, 1, relu=False, bn=False) + for _ in range(nstack) + ]) + self.merge_edgemaps = nn.ModuleList([ + ConvBlock(self.num_edges, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.merge_pointmaps = nn.ModuleList([ + ConvBlock( + self.num_points, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.edgemap_act = Activation('sigmoid', self.num_edges) + self.pointmap_act = Activation('sigmoid', self.num_points) + + self.merge_features = nn.ModuleList([ + ConvBlock(in_channel, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + self.merge_heatmaps = nn.ModuleList([ + ConvBlock(self.num_heats, in_channel, 1, relu=False, bn=False) + for _ in range(nstack - 1) + ]) + + self.nstack = nstack + + self.heatmap_act = Activation('in+relu', self.num_heats) + + self.inference = False + + def set_inference(self, inference): + self.inference = inference + + def forward(self, x): + x = self.pre(x) + + y, fusionmaps = [], [] + heatmaps = None + for i in range(self.nstack): + hg = self.hgs[i](x, heatmap=heatmaps) + feature = 
self.features[i](hg) + + heatmaps0 = self.out_heatmaps[i](feature) + heatmaps = self.heatmap_act(heatmaps0) + + if self.cfg.use_AAM: + pointmaps0 = self.out_pointmaps[i](feature) + pointmaps = self.pointmap_act(pointmaps0) + edgemaps0 = self.out_edgemaps[i](feature) + edgemaps = self.edgemap_act(edgemaps0) + mask = self.e2h_transform(edgemaps) * pointmaps + fusion_heatmaps = mask * heatmaps + else: + fusion_heatmaps = heatmaps + + landmarks = self.decoder.get_coords_from_heatmap(fusion_heatmaps) + + if i < self.nstack - 1: + x = x + self.merge_features[i](feature) + \ + self.merge_heatmaps[i](heatmaps) + if self.cfg.use_AAM: + x += self.merge_pointmaps[i](pointmaps) + x += self.merge_edgemaps[i](edgemaps) + + y.append(landmarks) + if self.cfg.use_AAM: + y.append(pointmaps) + y.append(edgemaps) + + fusionmaps.append(fusion_heatmaps) + + return y, fusionmaps, landmarks diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py new file mode 100644 index 000000000..bede64a74 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/__init__.py @@ -0,0 +1,5 @@ +from .alignmentDataset import AlignmentDataset +from .decoder import get_decoder +from .encoder import get_encoder + +__all__ = ['Augmentation', 'AlignmentDataset', 'get_encoder', 'get_decoder'] diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py new file mode 100644 index 000000000..d0105489a --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/alignmentDataset.py @@ -0,0 +1,360 @@ +import copy +import hashlib +import math +import os +import sys + +import cv2 +import imageio +import numpy as np +import pandas as pd +import torch +import torch.nn.functional as F +from PIL import Image, ImageEnhance, ImageFile +from scipy import interpolate +from torch.utils.data import Dataset + +from .encoder import get_encoder + +ImageFile.LOAD_TRUNCATED_IMAGES = True + + +class AlignmentDataset(Dataset): + + def __init__( + self, + tsv_flie, + image_dir='', + transform=None, + width=256, + height=256, + channels=3, + means=(127.5, 127.5, 127.5), + scale=1 / 127.5, + classes_num=None, + crop_op=True, + aug_prob=0.0, + edge_info=None, + flip_mapping=None, + is_train=True, + encoder_type='default', + ): + super(AlignmentDataset, self).__init__() + self.use_AAM = True + self.encoder_type = encoder_type + self.encoder = get_encoder(height, width, encoder_type=encoder_type) + self.items = pd.read_csv(tsv_flie, sep='\t') + self.image_dir = image_dir + self.landmark_num = classes_num[0] + self.transform = transform + + self.image_width = width + self.image_height = height + self.channels = channels + assert self.image_width == self.image_height + + self.means = means + self.scale = scale + + self.aug_prob = aug_prob + self.edge_info = edge_info + self.is_train = is_train + std_lmk_5pts = np.array([ + 196.0, 226.0, 316.0, 226.0, 256.0, 286.0, 220.0, 360.4, 292.0, + 360.4 + ], np.float32) / 256.0 - 1.0 + std_lmk_5pts = np.reshape(std_lmk_5pts, (5, 2)) # [-1 1] + target_face_scale = 1.0 if crop_op else 1.25 + + self.augmentation = Augmentation( + is_train=self.is_train, + aug_prob=self.aug_prob, + image_size=self.image_width, + crop_op=crop_op, + std_lmk_5pts=std_lmk_5pts, + target_face_scale=target_face_scale, + flip_rate=0.5, + flip_mapping=flip_mapping, + random_shift_sigma=0.05, + random_rot_sigma=math.pi / 180 * 
18, + random_scale_sigma=0.1, + random_gray_rate=0.2, + random_occ_rate=0.4, + random_blur_rate=0.3, + random_gamma_rate=0.2, + random_nose_fusion_rate=0.2) + + def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'): + # Check that any part of the gaussian is in-bounds + tmp_size = sigma * 3 + ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)] + br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)] + if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 + or br[0] - 1 < 0 or br[1] - 1 < 0): + # If not, just return the image as is + return img + + # Generate gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + if label_type == 'Gaussian': + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + else: + g = sigma / (((x - x0)**2 + (y - y0)**2 + sigma**2)**1.5) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], img.shape[1]) + img_y = max(0, ul[1]), min(br[1], img.shape[0]) + + img[img_y[0]:img_y[1], + img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return img + + def _polylines(self, + img, + lmks, + is_closed, + color=255, + thickness=1, + draw_mode=cv2.LINE_AA, + interpolate_mode=cv2.INTER_AREA, + scale=4): + h, w = img.shape + img_scale = cv2.resize( + img, (w * scale, h * scale), interpolation=interpolate_mode) + lmks_scale = (lmks * scale + 0.5).astype(np.int32) + cv2.polylines(img_scale, [lmks_scale], is_closed, color, + thickness * scale, draw_mode) + img = cv2.resize(img_scale, (w, h), interpolation=interpolate_mode) + return img + + def _generate_edgemap(self, points, scale=0.25, thickness=1): + h, w = self.image_height, self.image_width + edgemaps = [] + for is_closed, indices in self.edge_info: + edgemap = np.zeros([h, w], dtype=np.float32) + # align_corners: False. 
+ part = copy.deepcopy(points[np.array(indices)]) + + part = self._fit_curve(part, is_closed) + part[:, 0] = np.clip(part[:, 0], 0, w - 1) + part[:, 1] = np.clip(part[:, 1], 0, h - 1) + edgemap = self._polylines(edgemap, part, is_closed, 255, thickness) + + edgemaps.append(edgemap) + edgemaps = np.stack(edgemaps, axis=0) / 255.0 + edgemaps = torch.from_numpy(edgemaps).float().unsqueeze(0) + edgemaps = F.interpolate( + edgemaps, + size=(int(w * scale), int(h * scale)), + mode='bilinear', + align_corners=False).squeeze() + return edgemaps + + def _fit_curve(self, lmks, is_closed=False, density=5): + try: + x = lmks[:, 0].copy() + y = lmks[:, 1].copy() + if is_closed: + x = np.append(x, x[0]) + y = np.append(y, y[0]) + tck, u = interpolate.splprep([x, y], s=0, per=is_closed, k=3) + # bins = (x.shape[0] - 1) * density + 1 + # lmk_x, lmk_y = interpolate.splev(np.linspace(0, 1, bins), f) + intervals = np.array([]) + for i in range(len(u) - 1): + intervals = np.concatenate( + (intervals, + np.linspace(u[i], u[i + 1], density, endpoint=False))) + if not is_closed: + intervals = np.concatenate((intervals, [u[-1]])) + lmk_x, lmk_y = interpolate.splev(intervals, tck, der=0) + # der_x, der_y = interpolate.splev(intervals, tck, der=1) + curve_lmks = np.stack([lmk_x, lmk_y], axis=-1) + # curve_ders = np.stack([der_x, der_y], axis=-1) + # origin_indices = np.arange(0, curve_lmks.shape[0], density) + + return curve_lmks + except Exception: + return lmks + + def _image_id(self, image_path): + if not os.path.exists(image_path): + image_path = os.path.join(self.image_dir, image_path) + return hashlib.md5(open(image_path, 'rb').read()).hexdigest() + + def _load_image(self, image_path): + if not os.path.exists(image_path): + image_path = os.path.join(self.image_dir, image_path) + + try: + # img = cv2.imdecode(np.fromfile(image_path, dtype=np.uint8), cv2.IMREAD_COLOR)#HWC, BGR, [0-255] + img = cv2.imread(image_path, cv2.IMREAD_COLOR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + try: + img = imageio.imread(image_path) # HWC, RGB, [0-255] + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + try: + gifImg = imageio.mimread(image_path) # BHWC, RGB, [0-255] + img = gifImg[0] # HWC, RGB, [0-255] + img = cv2.cvtColor(img, + cv2.COLOR_RGB2BGR) # HWC, BGR, [0-255] + assert img is not None and len( + img.shape) == 3 and img.shape[2] == 3 + except Exception: + img = None + return img + + def _compose_rotate_and_scale(self, angle, scale, shift_xy, from_center, + to_center): + cosv = math.cos(angle) + sinv = math.sin(angle) + + fx, fy = from_center + tx, ty = to_center + + acos = scale * cosv + asin = scale * sinv + + a0 = acos + a1 = -asin + a2 = tx - acos * fx + asin * fy + shift_xy[0] + + b0 = asin + b1 = acos + b2 = ty - asin * fx - acos * fy + shift_xy[1] + + rot_scale_m = np.array([[a0, a1, a2], [b0, b1, b2], [0.0, 0.0, 1.0]], + np.float32) + return rot_scale_m + + def _transformPoints2D(self, points, matrix): + """ + points (nx2), matrix (3x3) -> points (nx2) + """ + dtype = points.dtype + + # nx3 + points = np.concatenate([points, np.ones_like(points[:, [0]])], axis=1) + points = points @ np.transpose(matrix) # nx3 + points = points[:, :2] / points[:, [2, 2]] + return points.astype(dtype) + + def _transformPerspective(self, image, matrix, target_shape): + """ + image, matrix3x3 -> transformed_image + """ + return cv2.warpPerspective( + 
image, + matrix, + dsize=(target_shape[1], target_shape[0]), + flags=cv2.INTER_LINEAR, + borderValue=0) + + def _norm_points(self, points, h, w, align_corners=False): + if align_corners: + # [0, SIZE-1] -> [-1, +1] + des_points = points / torch.tensor([w - 1, h - 1]).to(points).view( + 1, 2) * 2 - 1 + else: + # [-0.5, SIZE-0.5] -> [-1, +1] + des_points = (points * 2 + 1) / torch.tensor( + [w, h]).to(points).view(1, 2) - 1 + des_points = torch.clamp(des_points, -1, 1) + return des_points + + def _denorm_points(self, points, h, w, align_corners=False): + if align_corners: + # [-1, +1] -> [0, SIZE-1] + des_points = (points + 1) / 2 * torch.tensor( + [w - 1, h - 1]).to(points).view(1, 1, 2) + else: + # [-1, +1] -> [-0.5, SIZE-0.5] + des_points = ( + (points + 1) * torch.tensor([w, h]).to(points).view(1, 1, 2) + - 1) / 2 + return des_points + + def __len__(self): + return len(self.items) + + def __getitem__(self, index): + sample = dict() + + image_path = self.items.iloc[index, 0] + landmarks_5pts = self.items.iloc[index, 1] + landmarks_5pts = np.array( + list(map(float, landmarks_5pts.split(','))), + dtype=np.float32).reshape(5, 2) + landmarks_target = self.items.iloc[index, 2] + landmarks_target = np.array( + list(map(float, landmarks_target.split(','))), + dtype=np.float32).reshape(self.landmark_num, 2) + scale = float(self.items.iloc[index, 3]) + center_w, center_h = float(self.items.iloc[index, 4]), float( + self.items.iloc[index, 5]) + if len(self.items.iloc[index]) > 6: + tags = np.array( + list( + map(lambda x: int(float(x)), + self.items.iloc[index, 6].split(',')))) + else: + tags = np.array([]) + + # image & keypoints alignment + image_path = image_path.replace('\\', '/') + # wflw testset + image_path = image_path.replace( + '//msr-facestore/Workspace/MSRA_EP_Allergan/users/yanghuan/training_data/wflw/rawImages/', + '') + # trainset + image_path = image_path.replace('./rawImages/', '') + image_path = os.path.join(self.image_dir, image_path) + + # image path + sample['image_path'] = image_path + + img = self._load_image(image_path) # HWC, BGR, [0, 255] + assert img is not None + + # augmentation + # landmarks_target = [-0.5, edge-0.5] + img, landmarks_target, matrix = \ + self.augmentation.process(img, landmarks_target, landmarks_5pts, scale, center_w, center_h) + + landmarks = self._norm_points( + torch.from_numpy(landmarks_target), self.image_height, + self.image_width) + + sample['label'] = [ + landmarks, + ] + + if self.use_AAM: + pointmap = self.encoder.generate_heatmap(landmarks_target) + edgemap = self._generate_edgemap(landmarks_target) + sample['label'] += [pointmap, edgemap] + + sample['matrix'] = matrix + + # image normalization + img = img.transpose(2, 0, 1).astype(np.float32) # CHW, BGR, [0, 255] + img[0, :, :] = (img[0, :, :] - self.means[0]) * self.scale + img[1, :, :] = (img[1, :, :] - self.means[1]) * self.scale + img[2, :, :] = (img[2, :, :] - self.means[2]) * self.scale + sample['data'] = torch.from_numpy(img) # CHW, BGR, [-1, 1] + + sample['tags'] = tags + + return sample diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py new file mode 100644 index 000000000..9acc9bcb5 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/__init__.py @@ -0,0 +1,9 @@ +from .decoder_default import decoder_default + + +def get_decoder(decoder_type='default'): + if decoder_type == 'default': + decoder = decoder_default() + else: + raise 
NotImplementedError + return decoder diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py new file mode 100644 index 000000000..4e1c7c70c --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/decoder/decoder_default.py @@ -0,0 +1,39 @@ +import torch + + +class decoder_default: + + def __init__(self, weight=1, use_weight_map=False): + self.weight = weight + self.use_weight_map = use_weight_map + + def _make_grid(self, h, w): + yy, xx = torch.meshgrid( + torch.arange(h).float() / (h - 1) * 2 - 1, + torch.arange(w).float() / (w - 1) * 2 - 1) + return yy, xx + + def get_coords_from_heatmap(self, heatmap): + """ + inputs: + - heatmap: batch x npoints x h x w + + outputs: + - coords: batch x npoints x 2 (x,y), [-1, +1] + - radius_sq: batch x npoints + """ + batch, npoints, h, w = heatmap.shape + if self.use_weight_map: + heatmap = heatmap * self.weight + + yy, xx = self._make_grid(h, w) + yy = yy.view(1, 1, h, w).to(heatmap) + xx = xx.view(1, 1, h, w).to(heatmap) + + heatmap_sum = torch.clamp(heatmap.sum([2, 3]), min=1e-6) + + yy_coord = (yy * heatmap).sum([2, 3]) / heatmap_sum # batch x npoints + xx_coord = (xx * heatmap).sum([2, 3]) / heatmap_sum # batch x npoints + coords = torch.stack([xx_coord, yy_coord], dim=-1) + + return coords diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py new file mode 100644 index 000000000..60af50821 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/__init__.py @@ -0,0 +1,13 @@ +from .encoder_default import encoder_default + + +def get_encoder(image_height, + image_width, + scale=0.25, + sigma=1.5, + encoder_type='default'): + if encoder_type == 'default': + encoder = encoder_default(image_height, image_width, scale, sigma) + else: + raise NotImplementedError + return encoder diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py new file mode 100644 index 000000000..8bff79421 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/dataset/encoder/encoder_default.py @@ -0,0 +1,68 @@ +import copy + +import numpy as np +import torch +import torch.nn.functional as F + + +class encoder_default: + + def __init__(self, image_height, image_width, scale=0.25, sigma=1.5): + self.image_height = image_height + self.image_width = image_width + self.scale = scale + self.sigma = sigma + + def generate_heatmap(self, points): + # points = (num_pts, 2) + h, w = self.image_height, self.image_width + pointmaps = [] + for i in range(len(points)): + pointmap = np.zeros([h, w], dtype=np.float32) + # align_corners: False. 
+ point = copy.deepcopy(points[i]) + point[0] = max(0, min(w - 1, point[0])) + point[1] = max(0, min(h - 1, point[1])) + pointmap = self._circle(pointmap, point, sigma=self.sigma) + + pointmaps.append(pointmap) + pointmaps = np.stack(pointmaps, axis=0) / 255.0 + pointmaps = torch.from_numpy(pointmaps).float().unsqueeze(0) + pointmaps = F.interpolate( + pointmaps, + size=(int(w * self.scale), int(h * self.scale)), + mode='bilinear', + align_corners=False).squeeze() + return pointmaps + + def _circle(self, img, pt, sigma=1.0, label_type='Gaussian'): + # Check that any part of the gaussian is in-bounds + tmp_size = sigma * 3 + ul = [int(pt[0] - tmp_size), int(pt[1] - tmp_size)] + br = [int(pt[0] + tmp_size + 1), int(pt[1] + tmp_size + 1)] + if (ul[0] > img.shape[1] - 1 or ul[1] > img.shape[0] - 1 + or br[0] - 1 < 0 or br[1] - 1 < 0): + # If not, just return the image as is + return img + + # Generate gaussian + size = 2 * tmp_size + 1 + x = np.arange(0, size, 1, np.float32) + y = x[:, np.newaxis] + x0 = y0 = size // 2 + # The gaussian is not normalized, we want the center value to equal 1 + if label_type == 'Gaussian': + g = np.exp(-((x - x0)**2 + (y - y0)**2) / (2 * sigma**2)) + else: + g = sigma / (((x - x0)**2 + (y - y0)**2 + sigma**2)**1.5) + + # Usable gaussian range + g_x = max(0, -ul[0]), min(br[0], img.shape[1]) - ul[0] + g_y = max(0, -ul[1]), min(br[1], img.shape[0]) - ul[1] + # Image range + img_x = max(0, ul[0]), min(br[0], img.shape[1]) + img_y = max(0, ul[1]), min(br[1], img.shape[0]) + + img[img_y[0]:img_y[1], + img_x[0]:img_x[1]] = 255 * g[g_y[0]:g_y[1], g_x[0]:g_x[1]] + return img diff --git a/modelscope/models/cv/facial_68ldk_detection/lib/utility.py b/modelscope/models/cv/facial_68ldk_detection/lib/utility.py new file mode 100644 index 000000000..2e195761b --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/lib/utility.py @@ -0,0 +1,54 @@ +import os.path as osp +import time + +import json +import numpy as np +import torch + +from ..conf import * +from .backbone import StackedHGNetV1 + + +def get_config(args): + config = None + config_name = args.config_name + if config_name == 'alignment': + config = Alignment(args) + else: + assert NotImplementedError + + return config + + +def get_net(config): + net = None + if config.net == 'stackedHGnet_v1': + net = StackedHGNetV1( + config=config, + classes_num=config.classes_num, + edge_info=config.edge_info, + nstack=config.nstack, + add_coord=config.add_coord, + decoder_type=config.decoder_type) + else: + assert False + return net + + +def set_environment(config): + if config.device_id >= 0: + assert torch.cuda.is_available( + ) and torch.cuda.device_count() > config.device_id + torch.cuda.empty_cache() + config.device = torch.device('cuda', config.device_id) + config.use_gpu = True + else: + config.device = torch.device('cpu') + config.use_gpu = False + + torch.set_default_dtype(torch.float32) + torch.set_default_tensor_type(torch.FloatTensor) + torch.set_flush_denormal(True) # ignore extremely small value + torch.backends.cudnn.benchmark = True + # This flag allows you to enable the inbuilt cudnn auto-tuner to find the best algorithm to use for your hardware. + torch.autograd.set_detect_anomaly(True) diff --git a/modelscope/models/cv/facial_68ldk_detection/star_model.py b/modelscope/models/cv/facial_68ldk_detection/star_model.py new file mode 100644 index 000000000..c0d37ef10 --- /dev/null +++ b/modelscope/models/cv/facial_68ldk_detection/star_model.py @@ -0,0 +1,35 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+import os + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.facial_68ldk_detection import infer +from modelscope.outputs import OutputKeys +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@MODELS.register_module( + Tasks.facial_68ldk_detection, module_name=Models.star_68ldk_detection) +class FaceLandmarkDetection(TorchModel): + + def __init__(self, model_dir, *args, **kwargs): + super().__init__(model_dir, *args, **kwargs) + + def forward(self, Inputs): + return Inputs + + def postprocess(self, Inputs): + return Inputs + + def inference(self, data): + return data diff --git a/modelscope/models/cv/head_reconstruction/models/headrecon_model.py b/modelscope/models/cv/head_reconstruction/models/headrecon_model.py index e515421c1..a3d5cb6f9 100644 --- a/modelscope/models/cv/head_reconstruction/models/headrecon_model.py +++ b/modelscope/models/cv/head_reconstruction/models/headrecon_model.py @@ -109,7 +109,7 @@ def __init__(self, model_dir, *args, **kwargs): ] self.compute_feat_loss = perceptual_loss - self.comupte_color_loss = photo_loss + self.compute_color_loss = photo_loss self.compute_lm_loss = landmark_loss self.compute_reg_loss = reg_loss self.compute_reflc_loss = reflectance_loss @@ -519,7 +519,7 @@ def get_edge_points_horizontal(self): def compute_losses_fitting(self): face_mask = self.pred_mask face_mask = face_mask.detach() - self.loss_color = self.opt.w_color * self.comupte_color_loss( + self.loss_color = self.opt.w_color * self.compute_color_loss( self.pred_face, self.input_img, face_mask) # 1.0 loss_reg, loss_gamma = self.compute_reg_loss( @@ -552,7 +552,7 @@ def compute_losses_fitting(self): head_mask = self.pred_mask_head head_mask = head_mask.detach() - self.loss_color_head = self.opt.w_color * self.comupte_color_loss( + self.loss_color_head = self.opt.w_color * self.compute_color_loss( self.pred_head, self.input_img, head_mask) # 1.0 self.loss_smooth_offset_head = TVLoss()( self.shape_offset_uv_head.permute(0, 3, 1, 2)) * 100 # 10000 diff --git a/modelscope/models/cv/head_reconstruction/models/losses.py b/modelscope/models/cv/head_reconstruction/models/losses.py index 6d4af4e8d..e170112d9 100644 --- a/modelscope/models/cv/head_reconstruction/models/losses.py +++ b/modelscope/models/cv/head_reconstruction/models/losses.py @@ -49,7 +49,7 @@ def perceptual_loss(id_featureA, id_featureB): # image level loss def photo_loss(imageA, imageB, mask, eps=1e-6): """ - l2 norm (with sqrt, to ensure backward stabililty, use eps, otherwise Nan may occur) + l2 norm (with sqrt, to ensure backward stability, use eps, otherwise Nan may occur) Parameters: imageA --torch.tensor (B, 3, H, W), range (0, 1), RGB order imageB --same as imageA diff --git a/modelscope/models/cv/human3d_animation/generate_skeleton.py b/modelscope/models/cv/human3d_animation/generate_skeleton.py index 556cdbd37..6543c8485 100644 --- a/modelscope/models/cv/human3d_animation/generate_skeleton.py +++ b/modelscope/models/cv/human3d_animation/generate_skeleton.py @@ -9,7 +9,7 @@ from .utils import matrix_to_axis_angle, rotation_6d_to_matrix -def laod_smpl_params(pose_fname): +def load_smpl_params(pose_fname): with open(pose_fname, 'rb') as f: data = pickle.load(f) pose = 
torch.from_numpy(data['pose']) @@ -132,7 +132,7 @@ def gen_skeleton_bvh(model_dir, action_dir, case_dir, action, mode='move'): device = torch.device('cpu') assets_dir = os.path.join(model_dir, '3D-assets') pkl_path = os.path.join(assets_dir, 'smpl.pkl') - poses, shapes, trans, joints = laod_smpl_params(pkl_path) + poses, shapes, trans, joints = load_smpl_params(pkl_path) if action.endswith('.npy'): skeleton_path = os.path.join(assets_dir, 'skeleton_nohand.npy') else: diff --git a/modelscope/models/cv/human_image_generation/human_image_generation_infer.py b/modelscope/models/cv/human_image_generation/human_image_generation_infer.py index 0781d8930..420ce786a 100644 --- a/modelscope/models/cv/human_image_generation/human_image_generation_infer.py +++ b/modelscope/models/cv/human_image_generation/human_image_generation_infer.py @@ -148,7 +148,7 @@ def forward(self, x, y, z): return pred_result -def trans_keypoins(keypoints, param, img_size, offset=None): +def trans_keypoints(keypoints, param, img_size, offset=None): missing_keypoint_index = keypoints == -1 # crop the white line in the original dataset @@ -194,7 +194,7 @@ def get_label_tensor(path, img, param): [255, 0, 170], [255, 0, 85]] canvas = np.zeros((img.shape[1], img.shape[2], 3)).astype(np.uint8) keypoint = np.loadtxt(path) - keypoint, normalized_kp = trans_keypoins(keypoint, param, img.shape[1:]) + keypoint, normalized_kp = trans_keypoints(keypoint, param, img.shape[1:]) stickwidth = 4 for i in range(18): x, y = keypoint[i, 0:2] diff --git a/modelscope/models/cv/human_normal_estimation/__init__.py b/modelscope/models/cv/human_normal_estimation/__init__.py new file mode 100644 index 000000000..f176c6bfc --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .human_nnet import HumanNormalEstimation + +else: + _import_structure = { + 'human_nnet': ['HumanNormalEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/human_normal_estimation/human_nnet.py b/modelscope/models/cv/human_normal_estimation/human_nnet.py new file mode 100644 index 000000000..6621c8d3d --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/human_nnet.py @@ -0,0 +1,80 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
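# The human_normal_estimation __init__ above follows ModelScope's lazy-import
# convention: _import_structure maps the 'human_nnet' submodule to the names it
# exports, and LazyImportModule replaces the package module so that submodule
# is only imported when one of those names is first accessed. Illustrative
# usage (a hedged sketch; it assumes nothing beyond the mapping shown above):
from modelscope.models.cv.human_normal_estimation import HumanNormalEstimation
# The 'human_nnet' submodule (and its torch dependencies) is imported here, at
# first access, rather than when the parent package is loaded.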
+import os + +import numpy as np +import torch +import torchvision.transforms as T + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.human_normal_estimation.networks import config, nnet +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import Tasks + + +@MODELS.register_module( + Tasks.human_normal_estimation, module_name=Models.human_normal_estimation) +class HumanNormalEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + super().__init__(model_dir, **kwargs) + config_file = os.path.join(model_dir, 'config.txt') + args = config.get_args(txt_file=config_file) + args.encoder_path = os.path.join(model_dir, args.encoder_path) + + self.device = torch.device( + 'cuda:0') if torch.cuda.is_available() else torch.device('cpu') + self.nnet = nnet.NormalNet(args=args).to(self.device) + self.nnet_path = os.path.join(model_dir, 'ckpt/best_nnet.pt') + if os.path.exists(self.nnet_path): + ckpt = torch.load( + self.nnet_path, map_location=self.device)['model'] + load_dict = {} + for k, v in ckpt.items(): + if k.startswith('module.'): + k_ = k.replace('module.', '') + load_dict[k_] = v + else: + load_dict[k] = v + self.nnet.load_state_dict(load_dict) + self.nnet.eval() + + self.normalize = T.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + + def forward(self, inputs): + img = inputs['img'].astype(np.float32) / 255.0 + msk = inputs['msk'].astype(np.float32) / 255.0 + bbox = inputs['bbox'] + + img_h, img_w = img.shape[0:2] + img = torch.from_numpy(img).permute(2, 0, + 1).unsqueeze(0).to(self.device) + img = self.normalize(img) + + fx = fy = (max(img_h, img_h) / 2.0) / np.tan(np.deg2rad(60.0 / 2.0)) + cx = (img_h / 2.0) - 0.5 + cy = (img_w / 2.0) - 0.5 + + intrins = torch.tensor( + [[fx, 0, cx + 0.5], [0, fy, cy + 0.5], [0, 0, 1]], + dtype=torch.float32, + device=self.device).unsqueeze(0) + + pred_norm = self.nnet(img, intrins=intrins)[-1] + pred_norm = pred_norm.detach().cpu().permute(0, 2, 3, 1).numpy() + pred_norm = pred_norm[0, ...] 
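# Note on the intrinsics assembled above: the forward pass assumes a pinhole
# camera with a 60-degree field of view, so the focal length is
# fx = fy = (S / 2) / tan(FOV / 2), with S the image edge used above, and the
# principal point (cx, cy) sits at the image centre (the -0.5 / +0.5 offsets
# reflect the usual pixel-centre convention). For example, S = 1024 and
# FOV = 60 deg give fx = 512 / tan(30 deg) ~= 886.8.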
+ pred_norm = pred_norm * msk[..., None] + pred_norm = pred_norm[bbox[1]:bbox[3], bbox[0]:bbox[2]] + results = pred_norm + return results + + def postprocess(self, inputs): + normal_result = inputs + results = {OutputKeys.NORMALS: normal_result} + return results + + def inference(self, data): + results = self.forward(data) + return results diff --git a/modelscope/models/cv/human_normal_estimation/networks/__init__.py b/modelscope/models/cv/human_normal_estimation/networks/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/human_normal_estimation/networks/config.py b/modelscope/models/cv/human_normal_estimation/networks/config.py new file mode 100644 index 000000000..1a4883091 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/config.py @@ -0,0 +1,40 @@ +import argparse + + +def convert_arg_line_to_args(arg_line): + for arg in arg_line.split(): + if not arg.strip(): + continue + yield str(arg) + + +def get_args(txt_file=None): + parser = argparse.ArgumentParser( + fromfile_prefix_chars='@', conflict_handler='resolve') + parser.convert_arg_line_to_args = convert_arg_line_to_args + + # checkpoint (only needed when testing the model) + parser.add_argument('--ckpt_path', type=str, default=None) + parser.add_argument('--encoder_path', type=str, default=None) + + # ↓↓↓↓ + # NOTE: project-specific args + parser.add_argument('--output_dim', type=int, default=3, help='{3, 4}') + parser.add_argument('--output_type', type=str, default='R', help='{R, G}') + parser.add_argument('--feature_dim', type=int, default=64) + parser.add_argument('--hidden_dim', type=int, default=64) + + parser.add_argument('--encoder_B', type=int, default=5) + + parser.add_argument('--decoder_NF', type=int, default=2048) + parser.add_argument('--decoder_BN', default=False, action='store_true') + parser.add_argument('--decoder_down', type=int, default=2) + parser.add_argument( + '--learned_upsampling', default=False, action='store_true') + + # read arguments from txt file + if txt_file: + config_filename = '@' + txt_file + + args = parser.parse_args([config_filename]) + return args diff --git a/modelscope/models/cv/human_normal_estimation/networks/nnet.py b/modelscope/models/cv/human_normal_estimation/networks/nnet.py new file mode 100644 index 000000000..e10e97c90 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/nnet.py @@ -0,0 +1,125 @@ +import os +import sys + +import torch +import torch.nn as nn + +from .submodules import (Encoder, UpSampleBN, UpSampleGN, get_pixel_coords, + get_prediction_head, normal_activation, + upsample_via_bilinear, upsample_via_mask) + +PROJECT_DIR = os.path.split(os.path.dirname(os.path.realpath(__file__)))[0] +sys.path.append(PROJECT_DIR) + + +class NormalNet(nn.Module): + + def __init__(self, args): + super(NormalNet, self).__init__() + B = args.encoder_B + NF = args.decoder_NF + BN = args.decoder_BN + learned_upsampling = args.learned_upsampling + + self.encoder = Encoder(B=B, pretrained=False, ckpt=args.encoder_path) + self.decoder = Decoder( + num_classes=args.output_dim, + B=B, + NF=NF, + BN=BN, + learned_upsampling=learned_upsampling) + + def forward(self, x, **kwargs): + return self.decoder(self.encoder(x), **kwargs) + + +class Decoder(nn.Module): + + def __init__(self, + num_classes=3, + B=5, + NF=2048, + BN=False, + learned_upsampling=True): + super(Decoder, self).__init__() + input_channels = [2048, 176, 64, 40, 24] + + UpSample = UpSampleBN if BN else UpSampleGN + features = NF + + self.conv2 = 
nn.Conv2d( + input_channels[0] + 2, + features, + kernel_size=1, + stride=1, + padding=0) + self.up1 = UpSample( + skip_input=features // 1 + input_channels[1] + 2, + output_features=features // 2, + align_corners=False) + self.up2 = UpSample( + skip_input=features // 2 + input_channels[2] + 2, + output_features=features // 4, + align_corners=False) + self.up3 = UpSample( + skip_input=features // 4 + input_channels[3] + 2, + output_features=features // 8, + align_corners=False) + self.up4 = UpSample( + skip_input=features // 8 + input_channels[4] + 2, + output_features=features // 16, + align_corners=False) + i_dim = features // 16 + + self.downsample_ratio = 2 + self.output_dim = num_classes + + self.pred_head = get_prediction_head(i_dim + 2, 128, num_classes) + if learned_upsampling: + self.mask_head = get_prediction_head( + i_dim + 2, 128, + 9 * self.downsample_ratio * self.downsample_ratio) + self.upsample_fn = upsample_via_mask + else: + self.mask_head = lambda a: None + self.upsample_fn = upsample_via_bilinear + + self.pixel_coords = get_pixel_coords(h=1024, w=1024).to(0) + + def ray_embedding(self, x, intrins, orig_H, orig_W): + B, _, H, W = x.shape + fu = intrins[:, 0, 0].unsqueeze(-1).unsqueeze(-1) * (W / orig_W) + cu = intrins[:, 0, 2].unsqueeze(-1).unsqueeze(-1) * (W / orig_W) + fv = intrins[:, 1, 1].unsqueeze(-1).unsqueeze(-1) * (H / orig_H) + cv = intrins[:, 1, 2].unsqueeze(-1).unsqueeze(-1) * (H / orig_H) + + uv = self.pixel_coords[:, :2, :H, :W].repeat(B, 1, 1, 1) + uv[:, 0, :, :] = (uv[:, 0, :, :] - cu) / fu + uv[:, 1, :, :] = (uv[:, 1, :, :] - cv) / fv + return torch.cat([x, uv], dim=1) + + def forward(self, features, intrins): + x_block0, x_block1, x_block2, x_block3, x_block4 = features[4], features[5], features[6], \ + features[8], features[11] + _, _, orig_H, orig_W = features[0].shape + + x_d0 = self.conv2( + self.ray_embedding(x_block4, intrins, orig_H, orig_W)) + x_d1 = self.up1(x_d0, + self.ray_embedding(x_block3, intrins, orig_H, orig_W)) + x_d2 = self.up2(x_d1, + self.ray_embedding(x_block2, intrins, orig_H, orig_W)) + x_d3 = self.up3(x_d2, + self.ray_embedding(x_block1, intrins, orig_H, orig_W)) + x_feat = self.up4( + x_d3, self.ray_embedding(x_block0, intrins, orig_H, orig_W)) + + out = self.pred_head( + self.ray_embedding(x_feat, intrins, orig_H, orig_W)) + out = normal_activation(out, elu_kappa=True) + mask = self.mask_head( + self.ray_embedding(x_feat, intrins, orig_H, orig_W)) + up_out = self.upsample_fn( + out, up_mask=mask, downsample_ratio=self.downsample_ratio) + up_out = normal_activation(up_out, elu_kappa=False) + return [up_out] diff --git a/modelscope/models/cv/human_normal_estimation/networks/submodules.py b/modelscope/models/cv/human_normal_estimation/networks/submodules.py new file mode 100644 index 000000000..32fbd0116 --- /dev/null +++ b/modelscope/models/cv/human_normal_estimation/networks/submodules.py @@ -0,0 +1,214 @@ +import geffnet +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +INPUT_CHANNELS_DICT = { + 0: [1280, 112, 40, 24, 16], + 1: [1280, 112, 40, 24, 16], + 2: [1408, 120, 48, 24, 16], + 3: [1536, 136, 48, 32, 24], + 4: [1792, 160, 56, 32, 24], + 5: [2048, 176, 64, 40, 24], + 6: [2304, 200, 72, 40, 32], + 7: [2560, 224, 80, 48, 32] +} + + +class Encoder(nn.Module): + + def __init__(self, B=5, pretrained=True, ckpt=None): + super(Encoder, self).__init__() + if ckpt: + basemodel = geffnet.create_model( + 'tf_efficientnet_b%s_ap' % B, + pretrained=pretrained, + checkpoint_path=ckpt) + else: + 
basemodel = geffnet.create_model( + 'tf_efficientnet_b%s_ap' % B, pretrained=pretrained) + + basemodel.global_pool = nn.Identity() + basemodel.classifier = nn.Identity() + self.original_model = basemodel + + def forward(self, x): + features = [x] + for k, v in self.original_model._modules.items(): + if k == 'blocks': + for ki, vi in v._modules.items(): + features.append(vi(features[-1])) + else: + features.append(v(features[-1])) + return features + + +class ConvGRU(nn.Module): + + def __init__(self, hidden_dim, input_dim, ks=3): + super().__init__() + p = (ks - 1) // 2 + self.convz = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + self.convr = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + self.convq = nn.Conv2d( + hidden_dim + input_dim, hidden_dim, ks, padding=p) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r * h, x], dim=1))) + h = (1 - z) * h + z * q + return h + + +class UpSampleBN(nn.Module): + + def __init__(self, skip_input, output_features, align_corners=True): + super(UpSampleBN, self).__init__() + self._net = nn.Sequential( + nn.Conv2d( + skip_input, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU(), + nn.Conv2d( + output_features, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.BatchNorm2d(output_features), nn.LeakyReLU()) + self.align_corners = align_corners + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=[concat_with.size(2), + concat_with.size(3)], + mode='bilinear', + align_corners=self.align_corners) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +class Conv2d_WS(nn.Conv2d): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super(Conv2d_WS, + self).__init__(in_channels, out_channels, kernel_size, stride, + padding, dilation, groups, bias) + + def forward(self, x): + weight = self.weight + weight_mean = weight.mean( + dim=1, keepdim=True).mean( + dim=2, keepdim=True).mean( + dim=3, keepdim=True) + weight = weight - weight_mean + std = weight.view(weight.size(0), -1).std(dim=1).view(-1, 1, 1, + 1) + 1e-5 + weight = weight / std.expand_as(weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + +class UpSampleGN(nn.Module): + + def __init__(self, skip_input, output_features, align_corners=True): + super(UpSampleGN, self).__init__() + self._net = nn.Sequential( + Conv2d_WS( + skip_input, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU(), + Conv2d_WS( + output_features, + output_features, + kernel_size=3, + stride=1, + padding=1), nn.GroupNorm(8, output_features), nn.LeakyReLU()) + self.align_corners = align_corners + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=[concat_with.size(2), + concat_with.size(3)], + mode='bilinear', + align_corners=self.align_corners) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +def upsample_via_bilinear(out, up_mask=None, downsample_ratio=None): + return F.interpolate( + out, + scale_factor=downsample_ratio, + mode='bilinear', + align_corners=False) + + +def upsample_via_mask(out, up_mask, downsample_ratio, padding='zero'): + """ + convex upsampling + """ + # out: low-resolution output (B, o_dim, H, W) + # up_mask: 
(B, 9*k*k, H, W) + k = downsample_ratio + + B, C, H, W = out.shape + up_mask = up_mask.view(B, 1, 9, k, k, H, W) + up_mask = torch.softmax(up_mask, dim=2) # (B, 1, 9, k, k, H, W) + + if padding == 'zero': + up_out = F.unfold(out, [3, 3], padding=1) + elif padding == 'replicate': + out = F.pad(out, pad=(1, 1, 1, 1), mode='replicate') + up_out = F.unfold(out, [3, 3], padding=0) + else: + raise Exception('invalid padding for convex upsampling') + + up_out = up_out.view(B, C, 9, 1, 1, H, W) + + up_out = torch.sum(up_mask * up_out, dim=2) + up_out = up_out.permute(0, 1, 4, 2, 5, 3) + return up_out.reshape(B, C, k * H, k * W) + + +def get_prediction_head(input_dim, hidden_dim, output_dim): + return nn.Sequential( + nn.Conv2d(input_dim, hidden_dim, 3, padding=1), nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, hidden_dim, 1), nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, output_dim, 1)) + + +# submodules copy from DSINE +def get_pixel_coords(h, w): + pixel_coords = np.ones((3, h, w)).astype(np.float32) + x_range = np.concatenate([np.arange(w).reshape(1, w)] * h, axis=0) + y_range = np.concatenate([np.arange(h).reshape(h, 1)] * w, axis=1) + pixel_coords[0, :, :] = x_range + 0.5 + pixel_coords[1, :, :] = y_range + 0.5 + return torch.from_numpy(pixel_coords).unsqueeze(0) + + +def normal_activation(out, elu_kappa=True): + normal, kappa = out[:, :3, :, :], out[:, 3:, :, :] + normal = F.normalize(normal, p=2, dim=1) + if elu_kappa: + kappa = F.elu(kappa) + 1.0 + return torch.cat([normal, kappa], dim=1) diff --git a/modelscope/models/cv/human_reconstruction/models/detectors.py b/modelscope/models/cv/human_reconstruction/models/detectors.py index 4f63dd8c7..0fc41ab9e 100644 --- a/modelscope/models/cv/human_reconstruction/models/detectors.py +++ b/modelscope/models/cv/human_reconstruction/models/detectors.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on Pytorch, originally BSD License and publicly avaialbe at +# The implementation here is modified based on Pytorch, originally BSD License and publicly available at # https://github.com/pytorch/pytorch import numpy as np import torch diff --git a/modelscope/models/cv/human_reconstruction/models/geometry.py b/modelscope/models/cv/human_reconstruction/models/geometry.py index fa4a00a6b..43ef6da6c 100644 --- a/modelscope/models/cv/human_reconstruction/models/geometry.py +++ b/modelscope/models/cv/human_reconstruction/models/geometry.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on PIFU, originally MIT License and publicly avaialbe at +# The implementation here is modified based on PIFU, originally MIT License and publicly available at # https://github.com/shunsukesaito/PIFu/blob/master/lib/geometry.py import torch @@ -44,7 +44,7 @@ def perspective(points, calib, transform=None): args: points: [B, 3, N] 3d points in world coordinates calib: [B, 3, 4] projection matrix - transform: [B, 2, 3] screen space trasnformation + transform: [B, 2, 3] screen space transformation return: [B, 3, N] 3d coordinates in screen space """ diff --git a/modelscope/models/cv/human_reconstruction/models/networks.py b/modelscope/models/cv/human_reconstruction/models/networks.py index 266237b6b..1ef8c801e 100644 --- a/modelscope/models/cv/human_reconstruction/models/networks.py +++ b/modelscope/models/cv/human_reconstruction/models/networks.py @@ -1,4 +1,4 @@ -# The implementation here is modified based on Pix2PixHD, originally BSD License and publicly avaialbe at +# The implementation here is modified based on Pix2PixHD, originally BSD License and 
publicly available at # https://github.com/NVIDIA/pix2pixHD import functools diff --git a/modelscope/models/cv/image_body_reshaping/person_info.py b/modelscope/models/cv/image_body_reshaping/person_info.py index 509a2ce30..d205ae9ec 100644 --- a/modelscope/models/cv/image_body_reshaping/person_info.py +++ b/modelscope/models/cv/image_body_reshaping/person_info.py @@ -15,7 +15,7 @@ class PersonInfo(object): def __init__(self, joints): self.joints = joints self.flow = None - self.pad_boder = False + self.pad_border = False self.height_expand = 0 self.width_expand = 0 self.coeff = 0.2 @@ -24,11 +24,11 @@ def __init__(self, joints): self.divider = 20 self.flow_scales = ['upper_2'] - def update_attribute(self, pad_boder, height_expand, width_expand): - self.pad_boder = pad_boder + def update_attribute(self, pad_border, height_expand, width_expand): + self.pad_border = pad_border self.height_expand = height_expand self.width_expand = width_expand - if pad_boder: + if pad_border: self.joints[:, 0] += width_expand self.joints[:, 1] += height_expand @@ -41,7 +41,7 @@ def pred_flow(self, img, flow_net, device): if len(img.shape) == 2: img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) - if self.pad_boder: + if self.pad_border: height_expand = self.height_expand width_expand = self.width_expand pad_img = cv2.copyMakeBorder( diff --git a/modelscope/models/cv/image_body_reshaping/slim_utils.py b/modelscope/models/cv/image_body_reshaping/slim_utils.py index 23d5a741f..4ee0a6120 100644 --- a/modelscope/models/cv/image_body_reshaping/slim_utils.py +++ b/modelscope/models/cv/image_body_reshaping/slim_utils.py @@ -439,10 +439,10 @@ def get_heatmap_cv(img, magn, max_flow_mag): return cv_out -def save_heatmap_cv(img, flow, supression=2): +def save_heatmap_cv(img, flow, suppression=2): flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2) - flow_magn -= supression + flow_magn -= suppression flow_magn[flow_magn <= 0] = 0 cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3) return cv_out diff --git a/modelscope/models/cv/image_classification/backbones/beit_v2.py b/modelscope/models/cv/image_classification/backbones/beit_v2.py index eda117279..a567eada8 100644 --- a/modelscope/models/cv/image_classification/backbones/beit_v2.py +++ b/modelscope/models/cv/image_classification/backbones/beit_v2.py @@ -41,7 +41,7 @@ def forward(self, x): x = self.fc1(x) x = self.act(x) # x = self.drop(x) - # commit this for the orignal BERT implement + # commit this for the original BERT implement x = self.fc2(x) x = self.drop(x) return x diff --git a/modelscope/models/cv/image_color_enhance/adaint/adaint.py b/modelscope/models/cv/image_color_enhance/adaint/adaint.py index 8839f03a9..6977cb5a9 100644 --- a/modelscope/models/cv/image_color_enhance/adaint/adaint.py +++ b/modelscope/models/cv/image_color_enhance/adaint/adaint.py @@ -92,7 +92,7 @@ class Res18Backbone(nn.Module): r"""The ResNet-18 backbone. Args: - pretrained (bool, optional): Whether to use the torchvison pretrained weights. + pretrained (bool, optional): Whether to use the torchvision pretrained weights. Default: True. input_resolution (int, optional): Resolution for pre-downsampling. Default: 224. extra_pooling (bool, optional): [ignore]. @@ -312,7 +312,7 @@ def init_weights(self): and bias, respectively. 
""" - def special_initilization(m): + def special_initialization(m): classname = m.__class__.__name__ if 'Conv' in classname: nn.init.xavier_normal_(m.weight.data) @@ -321,7 +321,7 @@ def special_initilization(m): nn.init.constant_(m.bias.data, 0.0) if self.backbone_name not in ['res18']: - self.apply(special_initilization) + self.apply(special_initialization) self.lut_generator.init_weights() if self.en_adaint: self.adaint.init_weights() diff --git a/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py index bc118ff21..de279d1c0 100644 --- a/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py +++ b/modelscope/models/cv/image_defrcn_fewshot/utils/requirements_check.py @@ -56,7 +56,7 @@ def is_torch_version_available(): `pip install torch==1.11` """ -REQUIREMENTS_MAAPING_VERSION = OrderedDict([ +REQUIREMENTS_MAPPING_VERSION = OrderedDict([ ('detectron2-0.3', (is_detectron2_version_available, DETECTRON2_IMPORT_ERROR)), ('torch-1.11', (is_torch_version_available, TORCH_VERSION_IMPORT_ERROR)), @@ -68,8 +68,8 @@ def is_torch_version_available(): def requires_version(): checks = [] for req in REQUIREMENTS: - if req in REQUIREMENTS_MAAPING_VERSION: - check = REQUIREMENTS_MAAPING_VERSION[req] + if req in REQUIREMENTS_MAPPING_VERSION: + check = REQUIREMENTS_MAPPING_VERSION[req] else: raise NotImplementedError('{} do not supported check'.format(req)) checks.append(check) diff --git a/modelscope/models/cv/image_depth_estimation_marigold/__init__.py b/modelscope/models/cv/image_depth_estimation_marigold/__init__.py new file mode 100644 index 000000000..15e4c01eb --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .marigold import MarigoldDepthOutput + from .marigold_utils import (chw2hwc, colorize_depth_maps, ensemble_depths, + find_batch_size, inter_distances, + resize_max_res) +else: + _import_structure = { + 'marigold': ['MarigoldDepthOutput'], + 'marigold_utils': [ + 'find_batch_size', 'inter_distances', 'ensemble_depths', + 'colorize_depth_maps', 'chw2hwc', 'resize_max_res' + ] + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_depth_estimation_marigold/marigold.py b/modelscope/models/cv/image_depth_estimation_marigold/marigold.py new file mode 100644 index 000000000..a597b68c0 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/marigold.py @@ -0,0 +1,42 @@ +# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# -------------------------------------------------------------------------- +# If you find this code useful, we kindly ask you to cite our paper in your work. +# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation +# More information about the method can be found at https://marigoldmonodepth.github.io +# -------------------------------------------------------------------------- + +from typing import Dict, Union + +import numpy as np +from diffusers.utils import BaseOutput +from PIL import Image + + +class MarigoldDepthOutput(BaseOutput): + """ + Output class for Marigold monocular depth prediction pipeline. + + Args: + depth_np (`np.ndarray`): + Predicted depth map, with depth values in the range of [0, 1]. + depth_colored (`PIL.Image.Image`): + Colorized depth map, with the shape of [3, H, W] and values in [0, 1]. + uncertainty (`None` or `np.ndarray`): + Uncalibrated uncertainty(MAD, median absolute deviation) coming from ensembling. + """ + + depth_np: np.ndarray + depth_colored: Image.Image + uncertainty: Union[None, np.ndarray] diff --git a/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py b/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py new file mode 100644 index 000000000..00bceafe0 --- /dev/null +++ b/modelscope/models/cv/image_depth_estimation_marigold/marigold_utils.py @@ -0,0 +1,364 @@ +# Copyright 2023 Bingxin Ke, ETH Zurich. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# -------------------------------------------------------------------------- +# If you find this code useful, we kindly ask you to cite our paper in your work. +# Please find bibtex at: https://github.com/prs-eth/Marigold#-citation +# More information about the method can be found at https://marigoldmonodepth.github.io + +import math + +import matplotlib +import numpy as np +import torch +from PIL import Image +from scipy.optimize import minimize + +# Search table for suggested max. 
inference batch size +bs_search_table = [ + # tested on A100-PCIE-80GB + { + 'res': 768, + 'total_vram': 79, + 'bs': 35, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 79, + 'bs': 20, + 'dtype': torch.float32 + }, + # tested on A100-PCIE-40GB + { + 'res': 768, + 'total_vram': 39, + 'bs': 15, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 39, + 'bs': 8, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 39, + 'bs': 30, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 39, + 'bs': 15, + 'dtype': torch.float16 + }, + # tested on RTX3090, RTX4090 + { + 'res': 512, + 'total_vram': 23, + 'bs': 20, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 23, + 'bs': 7, + 'dtype': torch.float32 + }, + { + 'res': 1024, + 'total_vram': 23, + 'bs': 3, + 'dtype': torch.float32 + }, + { + 'res': 512, + 'total_vram': 23, + 'bs': 40, + 'dtype': torch.float16 + }, + { + 'res': 768, + 'total_vram': 23, + 'bs': 18, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 23, + 'bs': 10, + 'dtype': torch.float16 + }, + # tested on GTX1080Ti + { + 'res': 512, + 'total_vram': 10, + 'bs': 5, + 'dtype': torch.float32 + }, + { + 'res': 768, + 'total_vram': 10, + 'bs': 2, + 'dtype': torch.float32 + }, + { + 'res': 512, + 'total_vram': 10, + 'bs': 10, + 'dtype': torch.float16 + }, + { + 'res': 768, + 'total_vram': 10, + 'bs': 5, + 'dtype': torch.float16 + }, + { + 'res': 1024, + 'total_vram': 10, + 'bs': 3, + 'dtype': torch.float16 + }, +] + + +def find_batch_size(ensemble_size: int, input_res: int, + dtype: torch.dtype) -> int: + """ + Automatically search for suitable operating batch size. + + Args: + ensemble_size (`int`): + Number of predictions to be ensembled. + input_res (`int`): + Operating resolution of the input image. + + Returns: + `int`: Operating batch size. + """ + if not torch.cuda.is_available(): + return 1 + + total_vram = torch.cuda.mem_get_info()[1] / 1024.0**3 + filtered_bs_search_table = [ + s for s in bs_search_table if s['dtype'] == dtype + ] + for settings in sorted( + filtered_bs_search_table, + key=lambda k: (k['res'], -k['total_vram']), + ): + if input_res <= settings['res'] and total_vram >= settings[ + 'total_vram']: + bs = settings['bs'] + if bs > ensemble_size: + bs = ensemble_size + elif bs > math.ceil(ensemble_size / 2) and bs < ensemble_size: + bs = math.ceil(ensemble_size / 2) + return bs + + return 1 + + +def inter_distances(tensors: torch.Tensor): + """ + To calculate the distance between each two depth maps. 
+ """ + distances = [] + for i, j in torch.combinations(torch.arange(tensors.shape[0])): + arr1 = tensors[i:i + 1] + arr2 = tensors[j:j + 1] + distances.append(arr1 - arr2) + dist = torch.concatenate(distances, dim=0) + return dist + + +def ensemble_depths( + input_images: torch.Tensor, + regularizer_strength: float = 0.02, + max_iter: int = 2, + tol: float = 1e-3, + reduction: str = 'median', + max_res: int = None, +): + """ + To ensemble multiple affine-invariant depth images (up to scale and shift), + by aligning estimating the scale and shift + """ + device = input_images.device + dtype = input_images.dtype + np_dtype = np.float32 + + original_input = input_images.clone() + n_img = input_images.shape[0] + ori_shape = input_images.shape + + if max_res is not None: + scale_factor = torch.min(max_res / torch.tensor(ori_shape[-2:])) + if scale_factor < 1: + downscaler = torch.nn.Upsample( + scale_factor=scale_factor, mode='nearest') + input_images = downscaler(torch.from_numpy(input_images)).numpy() + + # init guess + _min = np.min(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) + _max = np.max(input_images.reshape((n_img, -1)).cpu().numpy(), axis=1) + s_init = 1.0 / (_max - _min).reshape((-1, 1, 1)) + t_init = (-1 * s_init.flatten() * _min.flatten()).reshape((-1, 1, 1)) + x = np.concatenate([s_init, t_init]).reshape(-1).astype(np_dtype) + + input_images = input_images.to(device) + + # objective function + def closure(x): + length = len(x) + s = x[:int(length / 2)] + t = x[int(length / 2):] + s = torch.from_numpy(s).to(dtype=dtype).to(device) + t = torch.from_numpy(t).to(dtype=dtype).to(device) + + transformed_arrays = input_images * s.view((-1, 1, 1)) + t.view( + (-1, 1, 1)) + dists = inter_distances(transformed_arrays) + sqrt_dist = torch.sqrt(torch.mean(dists**2)) + + if 'mean' == reduction: + pred = torch.mean(transformed_arrays, dim=0) + elif 'median' == reduction: + pred = torch.median(transformed_arrays, dim=0).values + else: + raise ValueError + + near_err = torch.sqrt((0 - torch.min(pred))**2) + far_err = torch.sqrt((1 - torch.max(pred))**2) + + err = sqrt_dist + (near_err + far_err) * regularizer_strength + err = err.detach().cpu().numpy().astype(np_dtype) + return err + + res = minimize( + closure, + x, + method='BFGS', + tol=tol, + options={ + 'maxiter': max_iter, + 'disp': False + }) + x = res.x + length = len(x) + s = x[:int(length / 2)] + t = x[int(length / 2):] + + # Prediction + s = torch.from_numpy(s).to(dtype=dtype).to(device) + t = torch.from_numpy(t).to(dtype=dtype).to(device) + transformed_arrays = original_input * s.view(-1, 1, 1) + t.view(-1, 1, 1) + if 'mean' == reduction: + aligned_images = torch.mean(transformed_arrays, dim=0) + std = torch.std(transformed_arrays, dim=0) + uncertainty = std + elif 'median' == reduction: + aligned_images = torch.median(transformed_arrays, dim=0).values + # MAD (median absolute deviation) as uncertainty indicator + abs_dev = torch.abs(transformed_arrays - aligned_images) + mad = torch.median(abs_dev, dim=0).values + uncertainty = mad + else: + raise ValueError(f'Unknown reduction method: {reduction}') + + # Scale and shift to [0, 1] + _min = torch.min(aligned_images) + _max = torch.max(aligned_images) + aligned_images = (aligned_images - _min) / (_max - _min) + uncertainty /= _max - _min + + return aligned_images, uncertainty + + +def colorize_depth_maps(depth_map, + min_depth, + max_depth, + cmap='Spectral', + valid_mask=None): + """ + Colorize depth maps. 
+ """ + assert len(depth_map.shape) >= 2, 'Invalid dimension' + + if isinstance(depth_map, torch.Tensor): + depth = depth_map.detach().clone().squeeze().numpy() + elif isinstance(depth_map, np.ndarray): + depth = depth_map.copy().squeeze() + # reshape to [ (B,) H, W ] + if depth.ndim < 3: + depth = depth[np.newaxis, :, :] + + # colorize + cm = matplotlib.colormaps[cmap] + depth = ((depth - min_depth) / (max_depth - min_depth)).clip(0, 1) + img_colored_np = cm(depth, bytes=False)[:, :, :, 0:3] # value from 0 to 1 + img_colored_np = np.rollaxis(img_colored_np, 3, 1) + + if valid_mask is not None: + if isinstance(depth_map, torch.Tensor): + valid_mask = valid_mask.detach().numpy() + valid_mask = valid_mask.squeeze() # [H, W] or [B, H, W] + if valid_mask.ndim < 3: + valid_mask = valid_mask[np.newaxis, np.newaxis, :, :] + else: + valid_mask = valid_mask[:, np.newaxis, :, :] + valid_mask = np.repeat(valid_mask, 3, axis=1) + img_colored_np[~valid_mask] = 0 + + if isinstance(depth_map, torch.Tensor): + img_colored = torch.from_numpy(img_colored_np).float() + elif isinstance(depth_map, np.ndarray): + img_colored = img_colored_np + + return img_colored + + +def chw2hwc(chw): + assert 3 == len(chw.shape) + if isinstance(chw, torch.Tensor): + hwc = torch.permute(chw, (1, 2, 0)) + elif isinstance(chw, np.ndarray): + hwc = np.moveaxis(chw, 0, -1) + return hwc + + +def resize_max_res(img: Image.Image, max_edge_resolution: int) -> Image.Image: + """ + Resize image to limit maximum edge length while keeping aspect ratio. + + Args: + img (`Image.Image`): + Image to be resized. + max_edge_resolution (`int`): + Maximum edge length (pixel). + + Returns: + `Image.Image`: Resized image. + """ + original_width, original_height = img.size + downscale_factor = min(max_edge_resolution / original_width, + max_edge_resolution / original_height) + + new_width = int(original_width * downscale_factor) + new_height = int(original_height * downscale_factor) + + resized_img = img.resize((new_width, new_height)) + return resized_img diff --git a/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py index e29ad2b9e..9aa0dc053 100644 --- a/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py +++ b/modelscope/models/cv/image_driving_perception/image_driving_percetion_model.py @@ -22,7 +22,7 @@ Tasks.image_driving_perception, module_name=Models.yolopv2) class YOLOPv2(TorchModel): """ YOLOPv2 use E-ELAN which first adopted in Yolov7 as backbone, SPP+FPN+PAN as neck and head. 
- For more infomation, please refer to https://arxiv.org/pdf/2208.11434.pdf + For more information, please refer to https://arxiv.org/pdf/2208.11434.pdf """ def __init__(self, model_dir: str, *args, **kwargs): diff --git a/modelscope/models/cv/image_driving_perception/preprocessor.py b/modelscope/models/cv/image_driving_perception/preprocessor.py index 3e0e476fd..2bb84eb3a 100644 --- a/modelscope/models/cv/image_driving_perception/preprocessor.py +++ b/modelscope/models/cv/image_driving_perception/preprocessor.py @@ -92,7 +92,7 @@ def __call__( Args: data (str): image path Returns: - Dict[ndarry, Any]: the preprocessed data + Dict[ndarray, Any]: the preprocessed data { "img": the preprocessed resized image (640x640) } diff --git a/modelscope/models/cv/image_editing/__init__.py b/modelscope/models/cv/image_editing/__init__.py index 35341a189..8b77bd0ac 100644 --- a/modelscope/models/cv/image_editing/__init__.py +++ b/modelscope/models/cv/image_editing/__init__.py @@ -5,11 +5,11 @@ if TYPE_CHECKING: from .masactrl import MutualSelfAttentionControl - from .masactrl_utils import regiter_attention_editor_diffusers + from .masactrl_utils import register_attention_editor_diffusers else: _import_structure = { 'masactrl': ['MutualSelfAttentionControl'], - 'masactrl_utils': ['regiter_attention_editor_diffusers'] + 'masactrl_utils': ['register_attention_editor_diffusers'] } import sys diff --git a/modelscope/models/cv/image_editing/masactrl_utils.py b/modelscope/models/cv/image_editing/masactrl_utils.py index a59e987f6..b74ff13f6 100644 --- a/modelscope/models/cv/image_editing/masactrl_utils.py +++ b/modelscope/models/cv/image_editing/masactrl_utils.py @@ -41,7 +41,7 @@ def reset(self): self.cur_att_layer = 0 -def regiter_attention_editor_diffusers(model, editor: AttentionBase): +def register_attention_editor_diffusers(model, editor: AttentionBase): """ Register a attention editor to Diffuser Pipeline, refer from [Prompt-to-Prompt] """ diff --git a/modelscope/models/cv/image_local_feature_matching/__init__.py b/modelscope/models/cv/image_local_feature_matching/__init__.py new file mode 100644 index 000000000..eecc611ec --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .loftr_model import LocalFeatureMatching + +else: + _import_structure = { + 'loftr_image_local_feature_matching': ['LocalFeatureMatching'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_local_feature_matching/loftr_model.py b/modelscope/models/cv/image_local_feature_matching/loftr_model.py new file mode 100644 index 000000000..d47b9da2a --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/loftr_model.py @@ -0,0 +1,79 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
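# The LoFTR wrapper below is registered with ModelScope the same way as the
# other models added in this patch: MODELS.register_module binds the
# (task, module_name) pair to the class so the framework can build it from a
# model configuration. A hedged usage sketch, assuming a local model directory
# that contains ModelFile.TORCH_MODEL_FILE and the usual configuration.json
# (the path below is a placeholder):
#
#     from modelscope.models import Model
#     matcher = Model.from_pretrained('<local_model_dir>')
#     data = {'image0': image0, 'image1': image1}   # (N, 1, H, W) gray tensors
#     outputs = matcher.inference(data)
#     results = matcher.postprocess(outputs)        # OutputKeys.MATCHES + vis image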
+import io +import os.path as osp +from copy import deepcopy + +import cv2 +import matplotlib.cm as cm +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_local_feature_matching.src.loftr import ( + LoFTR, default_cfg) +from modelscope.models.cv.image_local_feature_matching.src.utils.plotting import \ + make_matching_figure +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_local_feature_matching, + module_name=Models.loftr_image_local_feature_matching) +class LocalFeatureMatching(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + # Initialize LoFTR + _default_cfg = deepcopy(default_cfg) + self.model = LoFTR(config=_default_cfg) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoint = torch.load(model_path, map_location='cpu') + self.model.load_state_dict(checkpoint['state_dict']) + self.model.eval() + + def forward(self, Inputs): + self.model(Inputs) + result = { + 'kpts0': Inputs['mkpts0_f'], + 'kpts1': Inputs['mkpts1_f'], + 'conf': Inputs['mconf'], + } + Inputs.update(result) + return Inputs + + def postprocess(self, Inputs): + # Draw + color = cm.jet(Inputs['conf'].cpu().numpy()) + img0, img1, mkpts0, mkpts1 = Inputs['image0'].squeeze().cpu().numpy( + ), Inputs['image1'].squeeze().cpu().numpy(), Inputs['kpts0'].cpu( + ).numpy(), Inputs['kpts1'].cpu().numpy() + text = [ + 'LoFTR', + 'Matches: {}'.format(len(Inputs['kpts0'])), + ] + img0, img1 = (img0 * 255).astype(np.uint8), (img1 * 255).astype( + np.uint8) + fig = make_matching_figure( + img0, img1, mkpts0, mkpts1, color, text=text) + io_buf = io.BytesIO() + fig.savefig(io_buf, format='png', dpi=75) + io_buf.seek(0) + buf_data = np.frombuffer(io_buf.getvalue(), dtype=np.uint8) + io_buf.close() + vis_img = cv2.imdecode(buf_data, 1) + + results = {OutputKeys.MATCHES: Inputs, OutputKeys.OUTPUT_IMG: vis_img} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_local_feature_matching/src/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py new file mode 100644 index 000000000..0d69b9c13 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/__init__.py @@ -0,0 +1,2 @@ +from .loftr import LoFTR +from .utils.cvpr_ds_config import default_cfg diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py new file mode 100644 index 000000000..af4f526dd --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/__init__.py @@ -0,0 +1,12 @@ +from .resnet_fpn import ResNetFPN_8_2, ResNetFPN_16_4 + + +def build_backbone(config): + if config['backbone_type'] == 'ResNetFPN': + if config['resolution'] == (8, 2): + return ResNetFPN_8_2(config['resnetfpn']) + elif config['resolution'] == (16, 4): + return ResNetFPN_16_4(config['resnetfpn']) + else: + raise ValueError( + 
f"LOFTR.BACKBONE_TYPE {config['backbone_type']} not supported.") diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py new file mode 100644 index 000000000..ea7583d18 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/backbone/resnet_fpn.py @@ -0,0 +1,219 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv1x1(in_planes, out_planes, stride=1): + """1x1 convolution without padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + padding=0, + bias=False) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=1, + bias=False) + + +class BasicBlock(nn.Module): + + def __init__(self, in_planes, planes, stride=1): + super().__init__() + self.conv1 = conv3x3(in_planes, planes, stride) + self.conv2 = conv3x3(planes, planes) + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + + if stride == 1: + self.downsample = None + else: + self.downsample = nn.Sequential( + conv1x1(in_planes, planes, stride=stride), + nn.BatchNorm2d(planes)) + + def forward(self, x): + y = x + y = self.relu(self.bn1(self.conv1(y))) + y = self.bn2(self.conv2(y)) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x + y) + + +class ResNetFPN_8_2(nn.Module): + """ + ResNet+FPN, output resolution are 1/8 and 1/2. + Each block has 2 layers. + """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + + # 3. 
FPN upsample + self.layer3_outconv = conv1x1(block_dims[2], block_dims[2]) + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + self.layer1_outconv = conv1x1(block_dims[0], block_dims[1]) + self.layer1_outconv2 = nn.Sequential( + conv3x3(block_dims[1], block_dims[1]), + nn.BatchNorm2d(block_dims[1]), + nn.LeakyReLU(), + conv3x3(block_dims[1], block_dims[0]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + + # FPN + x3_out = self.layer3_outconv(x3) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + x2_out_2x = F.interpolate( + x2_out, scale_factor=2., mode='bilinear', align_corners=True) + x1_out = self.layer1_outconv(x1) + x1_out = self.layer1_outconv2(x1_out + x2_out_2x) + + return [x3_out, x1_out] + + +class ResNetFPN_16_4(nn.Module): + """ + ResNet+FPN, output resolution are 1/16 and 1/4. + Each block has 2 layers. + """ + + def __init__(self, config): + super().__init__() + # Config + block = BasicBlock + initial_dim = config['initial_dim'] + block_dims = config['block_dims'] + + # Class Variable + self.in_planes = initial_dim + + # Networks + self.conv1 = nn.Conv2d( + 1, initial_dim, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(initial_dim) + self.relu = nn.ReLU(inplace=True) + + self.layer1 = self._make_layer(block, block_dims[0], stride=1) # 1/2 + self.layer2 = self._make_layer(block, block_dims[1], stride=2) # 1/4 + self.layer3 = self._make_layer(block, block_dims[2], stride=2) # 1/8 + self.layer4 = self._make_layer(block, block_dims[3], stride=2) # 1/16 + + # 3. 
FPN upsample + self.layer4_outconv = conv1x1(block_dims[3], block_dims[3]) + self.layer3_outconv = conv1x1(block_dims[2], block_dims[3]) + self.layer3_outconv2 = nn.Sequential( + conv3x3(block_dims[3], block_dims[3]), + nn.BatchNorm2d(block_dims[3]), + nn.LeakyReLU(), + conv3x3(block_dims[3], block_dims[2]), + ) + + self.layer2_outconv = conv1x1(block_dims[1], block_dims[2]) + self.layer2_outconv2 = nn.Sequential( + conv3x3(block_dims[2], block_dims[2]), + nn.BatchNorm2d(block_dims[2]), + nn.LeakyReLU(), + conv3x3(block_dims[2], block_dims[1]), + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def _make_layer(self, block, dim, stride=1): + layer1 = block(self.in_planes, dim, stride=stride) + layer2 = block(dim, dim, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + def forward(self, x): + # ResNet Backbone + x0 = self.relu(self.bn1(self.conv1(x))) + x1 = self.layer1(x0) # 1/2 + x2 = self.layer2(x1) # 1/4 + x3 = self.layer3(x2) # 1/8 + x4 = self.layer4(x3) # 1/16 + + # FPN + x4_out = self.layer4_outconv(x4) + + x4_out_2x = F.interpolate( + x4_out, scale_factor=2., mode='bilinear', align_corners=True) + x3_out = self.layer3_outconv(x3) + x3_out = self.layer3_outconv2(x3_out + x4_out_2x) + + x3_out_2x = F.interpolate( + x3_out, scale_factor=2., mode='bilinear', align_corners=True) + x2_out = self.layer2_outconv(x2) + x2_out = self.layer2_outconv2(x2_out + x3_out_2x) + + return [x4_out, x2_out] diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py new file mode 100644 index 000000000..34cac8879 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr.py @@ -0,0 +1,93 @@ +import torch +import torch.nn as nn +from einops.einops import rearrange + +from .backbone import build_backbone +from .loftr_module import FinePreprocess, LocalFeatureTransformer +from .utils.coarse_matching import CoarseMatching +from .utils.fine_matching import FineMatching +from .utils.position_encoding import PositionEncodingSine + + +class LoFTR(nn.Module): + + def __init__(self, config): + super().__init__() + # Misc + self.config = config + + # Modules + self.backbone = build_backbone(config) + self.pos_encoding = PositionEncodingSine( + config['coarse']['d_model'], + temp_bug_fix=config['coarse']['temp_bug_fix']) + self.loftr_coarse = LocalFeatureTransformer(config['coarse']) + self.coarse_matching = CoarseMatching(config['match_coarse']) + self.fine_preprocess = FinePreprocess(config) + self.loftr_fine = LocalFeatureTransformer(config['fine']) + self.fine_matching = FineMatching() + + def forward(self, data): + """ + Update: + data (dict): { + 'image0': (torch.Tensor): (N, 1, H, W) + 'image1': (torch.Tensor): (N, 1, H, W) + 'mask0'(optional) : (torch.Tensor): (N, H, W) '0' indicates a padded position + 'mask1'(optional) : (torch.Tensor): (N, H, W) + } + """ + # 1. 
Local Feature CNN + data.update({ + 'bs': data['image0'].size(0), + 'hw0_i': data['image0'].shape[2:], + 'hw1_i': data['image1'].shape[2:] + }) + + if data['hw0_i'] == data['hw1_i']: # faster & better BN convergence + feats_c, feats_f = self.backbone( + torch.cat([data['image0'], data['image1']], dim=0)) + (feat_c0, feat_c1), (feat_f0, feat_f1) = feats_c.split( + data['bs']), feats_f.split(data['bs']) + else: # handle different input shapes + (feat_c0, feat_f0), (feat_c1, feat_f1) = self.backbone( + data['image0']), self.backbone(data['image1']) + + data.update({ + 'hw0_c': feat_c0.shape[2:], + 'hw1_c': feat_c1.shape[2:], + 'hw0_f': feat_f0.shape[2:], + 'hw1_f': feat_f1.shape[2:] + }) + + # 2. coarse-level loftr module + # add featmap with positional encoding, then flatten it to sequence [N, HW, C] + feat_c0 = rearrange(self.pos_encoding(feat_c0), 'n c h w -> n (h w) c') + feat_c1 = rearrange(self.pos_encoding(feat_c1), 'n c h w -> n (h w) c') + + mask_c0 = mask_c1 = None # mask is useful in training + if 'mask0' in data: + mask_c0, mask_c1 = data['mask0'].flatten( + -2), data['mask1'].flatten(-2) + feat_c0, feat_c1 = self.loftr_coarse(feat_c0, feat_c1, mask_c0, + mask_c1) + + # 3. match coarse-level + self.coarse_matching( + feat_c0, feat_c1, data, mask_c0=mask_c0, mask_c1=mask_c1) + + # 4. fine-level refinement + feat_f0_unfold, feat_f1_unfold = self.fine_preprocess( + feat_f0, feat_f1, feat_c0, feat_c1, data) + if feat_f0_unfold.size(0) != 0: # at least one coarse level predicted + feat_f0_unfold, feat_f1_unfold = self.loftr_fine( + feat_f0_unfold, feat_f1_unfold) + + # 5. match fine-level + self.fine_matching(feat_f0_unfold, feat_f1_unfold, data) + + def load_state_dict(self, state_dict, *args, **kwargs): + for k in list(state_dict.keys()): + if k.startswith('matcher.'): + state_dict[k.replace('matcher.', '', 1)] = state_dict.pop(k) + return super().load_state_dict(state_dict, *args, **kwargs) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py new file mode 100644 index 000000000..8d83af7e9 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/__init__.py @@ -0,0 +1,2 @@ +from .fine_preprocess import FinePreprocess +from .transformer import LocalFeatureTransformer diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py new file mode 100644 index 000000000..8624eab5e --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/fine_preprocess.py @@ -0,0 +1,73 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange, repeat + + +class FinePreprocess(nn.Module): + + def __init__(self, config): + super().__init__() + + self.config = config + self.cat_c_feat = config['fine_concat_coarse_feat'] + self.W = self.config['fine_window_size'] + + d_model_c = self.config['coarse']['d_model'] + d_model_f = self.config['fine']['d_model'] + self.d_model_f = d_model_f + if self.cat_c_feat: + self.down_proj = nn.Linear(d_model_c, d_model_f, bias=True) + self.merge_feat = nn.Linear(2 * d_model_f, d_model_f, bias=True) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.kaiming_normal_(p, mode='fan_out', nonlinearity='relu') + + def 
forward(self, feat_f0, feat_f1, feat_c0, feat_c1, data): + W = self.W + stride = data['hw0_f'][0] // data['hw0_c'][0] + + data.update({'W': W}) + if data['b_ids'].shape[0] == 0: + feat0 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + feat1 = torch.empty( + 0, self.W**2, self.d_model_f, device=feat_f0.device) + return feat0, feat1 + + # 1. unfold(crop) all local windows + feat_f0_unfold = F.unfold( + feat_f0, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f0_unfold = rearrange( + feat_f0_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + feat_f1_unfold = F.unfold( + feat_f1, kernel_size=(W, W), stride=stride, padding=W // 2) + feat_f1_unfold = rearrange( + feat_f1_unfold, 'n (c ww) l -> n l ww c', ww=W**2) + + # 2. select only the predicted matches + feat_f0_unfold = feat_f0_unfold[data['b_ids'], + data['i_ids']] # [n, ww, cf] + feat_f1_unfold = feat_f1_unfold[data['b_ids'], data['j_ids']] + + # option: use coarse-level loftr feature as context: concat and linear + if self.cat_c_feat: + feat_c_win = self.down_proj( + torch.cat([ + feat_c0[data['b_ids'], data['i_ids']], + feat_c1[data['b_ids'], data['j_ids']] + ], 0)) # [2n, c] + feat_cf_win = self.merge_feat( + torch.cat( + [ + torch.cat([feat_f0_unfold, feat_f1_unfold], + 0), # [2n, ww, cf] + repeat(feat_c_win, 'n c -> n ww c', ww = W ** 2), # [2n, ww, cf] + ], -1)) # yapf: disable + feat_f0_unfold, feat_f1_unfold = torch.chunk(feat_cf_win, 2, dim=0) + + return feat_f0_unfold, feat_f1_unfold diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py new file mode 100644 index 000000000..8e4f11d1d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/linear_attention.py @@ -0,0 +1,86 @@ +""" +Linear Transformer proposed in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" +Modified from: https://github.com/idiap/fast-transformers/blob/master/fast_transformers/attention/linear_attention.py +""" + +import torch +from torch.nn import Dropout, Module + + +def elu_feature_map(x): + return torch.nn.functional.elu(x) + 1 + + +class LinearAttention(Module): + + def __init__(self, eps=1e-6): + super().__init__() + self.feature_map = elu_feature_map + self.eps = eps + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """ Multi-Head linear attention proposed in "Transformers are RNNs" + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + Q = self.feature_map(queries) + K = self.feature_map(keys) + + # set padded position to zero + if q_mask is not None: + Q = Q * q_mask[:, :, None, None] + if kv_mask is not None: + K = K * kv_mask[:, :, None, None] + values = values * kv_mask[:, :, None, None] + + v_length = values.size(1) + values = values / v_length # prevent fp16 overflow + KV = torch.einsum('nshd,nshv->nhdv', K, values) # (S,D)' @ S,V + Z = 1 / (torch.einsum('nlhd,nhd->nlh', Q, K.sum(dim=1)) + self.eps) + queried_values = torch.einsum('nlhd,nhdv,nlh->nlhv', Q, KV, + Z) * v_length + + return queried_values.contiguous() + + +class FullAttention(Module): + + def __init__(self, use_dropout=False, attention_dropout=0.1): + super().__init__() + self.use_dropout = use_dropout + self.dropout = Dropout(attention_dropout) + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): 
+ """ Multi-head scaled dot-product attention, a.k.a full attention. + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + + # Compute the unnormalized attention and apply the masks + QK = torch.einsum('nlhd,nshd->nlsh', queries, keys) + if kv_mask is not None: + QK.masked_fill_( + ~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), + float('-inf')) + + # Compute the attention and the weighted average + softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) + A = torch.softmax(softmax_temp * QK, dim=2) + if self.use_dropout: + A = self.dropout(A) + + queried_values = torch.einsum('nlsh,nshd->nlhd', A, values) + + return queried_values.contiguous() diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py new file mode 100644 index 000000000..4c28f20d7 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/loftr_module/transformer.py @@ -0,0 +1,111 @@ +import copy + +import torch +import torch.nn as nn + +from .linear_attention import FullAttention, LinearAttention + + +class LoFTREncoderLayer(nn.Module): + + def __init__(self, d_model, nhead, attention='linear'): + super(LoFTREncoderLayer, self).__init__() + + self.dim = d_model // nhead + self.nhead = nhead + + # multi-head attention + self.q_proj = nn.Linear(d_model, d_model, bias=False) + self.k_proj = nn.Linear(d_model, d_model, bias=False) + self.v_proj = nn.Linear(d_model, d_model, bias=False) + self.attention = LinearAttention( + ) if attention == 'linear' else FullAttention() + self.merge = nn.Linear(d_model, d_model, bias=False) + + # feed-forward network + self.mlp = nn.Sequential( + nn.Linear(d_model * 2, d_model * 2, bias=False), + nn.ReLU(True), + nn.Linear(d_model * 2, d_model, bias=False), + ) + + # norm and dropout + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + + def forward(self, x, source, x_mask=None, source_mask=None): + """ + Args: + x (torch.Tensor): [N, L, C] + source (torch.Tensor): [N, S, C] + x_mask (torch.Tensor): [N, L] (optional) + source_mask (torch.Tensor): [N, S] (optional) + """ + bs = x.size(0) + query, key, value = x, source, source + + # multi-head attention + query = self.q_proj(query).view(bs, -1, self.nhead, + self.dim) # [N, L, (H, D)] + key = self.k_proj(key).view(bs, -1, self.nhead, + self.dim) # [N, S, (H, D)] + value = self.v_proj(value).view(bs, -1, self.nhead, self.dim) + message = self.attention( + query, key, value, q_mask=x_mask, + kv_mask=source_mask) # [N, L, (H, D)] + message = self.merge(message.view(bs, -1, + self.nhead * self.dim)) # [N, L, C] + message = self.norm1(message) + + # feed-forward network + message = self.mlp(torch.cat([x, message], dim=2)) + message = self.norm2(message) + + return x + message + + +class LocalFeatureTransformer(nn.Module): + """A Local Feature Transformer (LoFTR) module.""" + + def __init__(self, config): + super(LocalFeatureTransformer, self).__init__() + + self.config = config + self.d_model = config['d_model'] + self.nhead = config['nhead'] + self.layer_names = config['layer_names'] + encoder_layer = LoFTREncoderLayer(config['d_model'], config['nhead'], + config['attention']) + self.layers = nn.ModuleList([ + copy.deepcopy(encoder_layer) for _ in range(len(self.layer_names)) + ]) + self._reset_parameters() + + def _reset_parameters(self): + for p in 
self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, feat0, feat1, mask0=None, mask1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + mask0 (torch.Tensor): [N, L] (optional) + mask1 (torch.Tensor): [N, S] (optional) + """ + + assert self.d_model == feat0.size( + 2), 'the feature number of src and transformer must be equal' + + for layer, name in zip(self.layers, self.layer_names): + if name == 'self': + feat0 = layer(feat0, feat0, mask0, mask0) + feat1 = layer(feat1, feat1, mask1, mask1) + elif name == 'cross': + feat0 = layer(feat0, feat1, mask0, mask1) + feat1 = layer(feat1, feat0, mask1, mask0) + else: + raise KeyError + + return feat0, feat1 diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py new file mode 100644 index 000000000..c78356898 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/coarse_matching.py @@ -0,0 +1,264 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from einops.einops import rearrange + +INF = 1e9 + + +def mask_border(m, b: int, v): + """ Mask borders with value + Args: + m (torch.Tensor): [N, H0, W0, H1, W1] + b (int) + v (m.dtype) + """ + if b <= 0: + return + + m[:, :b] = v + m[:, :, :b] = v + m[:, :, :, :b] = v + m[:, :, :, :, :b] = v + m[:, -b:] = v + m[:, :, -b:] = v + m[:, :, :, -b:] = v + m[:, :, :, :, -b:] = v + + +def mask_border_with_padding(m, bd, v, p_m0, p_m1): + if bd <= 0: + return + + m[:, :bd] = v + m[:, :, :bd] = v + m[:, :, :, :bd] = v + m[:, :, :, :, :bd] = v + + h0s, w0s = p_m0.sum(1).max(-1)[0].int(), p_m0.sum(-1).max(-1)[0].int() + h1s, w1s = p_m1.sum(1).max(-1)[0].int(), p_m1.sum(-1).max(-1)[0].int() + for b_idx, (h0, w0, h1, w1) in enumerate(zip(h0s, w0s, h1s, w1s)): + m[b_idx, h0 - bd:] = v + m[b_idx, :, w0 - bd:] = v + m[b_idx, :, :, h1 - bd:] = v + m[b_idx, :, :, :, w1 - bd:] = v + + +def compute_max_candidates(p_m0, p_m1): + """Compute the max candidates of all pairs within a batch + + Args: + p_m0, p_m1 (torch.Tensor): padded masks + """ + h0s, w0s = p_m0.sum(1).max(-1)[0], p_m0.sum(-1).max(-1)[0] + h1s, w1s = p_m1.sum(1).max(-1)[0], p_m1.sum(-1).max(-1)[0] + max_cand = torch.sum( + torch.min(torch.stack([h0s * w0s, h1s * w1s], -1), -1)[0]) + return max_cand + + +class CoarseMatching(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + # general config + self.thr = config['thr'] + self.border_rm = config['border_rm'] + # -- # for trainig fine-level LoFTR + self.train_coarse_percent = config['train_coarse_percent'] + self.train_pad_num_gt_min = config['train_pad_num_gt_min'] + + # we provide 2 options for differentiable matching + self.match_type = config['match_type'] + if self.match_type == 'dual_softmax': + self.temperature = config['dsmax_temperature'] + elif self.match_type == 'sinkhorn': + try: + from .superglue import log_optimal_transport + except ImportError: + raise ImportError('download superglue.py first!') + self.log_optimal_transport = log_optimal_transport + self.bin_score = nn.Parameter( + torch.tensor(config['skh_init_bin_score'], requires_grad=True)) + self.skh_iters = config['skh_iters'] + 
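# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# CoarseMatching supports two differentiable matchers. The default,
# 'dual_softmax', multiplies a row-wise and a column-wise softmax of the
# similarity matrix and then keeps thresholded mutual nearest neighbours, as
# the forward/get_coarse_match code below does. A minimal standalone sketch
# with toy tensors (all names here are hypothetical):
import torch
import torch.nn.functional as F

def dual_softmax_sketch(feat_c0, feat_c1, temperature=0.1, thr=0.2):
    sim = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) / temperature
    conf = F.softmax(sim, 1) * F.softmax(sim, 2)      # soft mutual assignment
    mask = (conf > thr) \
        & (conf == conf.max(dim=2, keepdim=True)[0]) \
        & (conf == conf.max(dim=1, keepdim=True)[0])  # mutual nearest + thr
    return conf, mask

d = 32
f0, f1 = torch.randn(1, 6, d) / d**.5, torch.randn(1, 8, d) / d**.5
conf, mask = dual_softmax_sketch(f0, f1)
print(conf.shape, int(mask.sum()))  # torch.Size([1, 6, 8]), number of matches
# ------------------------------------------------------------------------------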
self.skh_prefilter = config['skh_prefilter'] + else: + raise NotImplementedError() + + def forward(self, feat_c0, feat_c1, data, mask_c0=None, mask_c1=None): + """ + Args: + feat0 (torch.Tensor): [N, L, C] + feat1 (torch.Tensor): [N, S, C] + data (dict) + mask_c0 (torch.Tensor): [N, L] (optional) + mask_c1 (torch.Tensor): [N, S] (optional) + Update: + data (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + NOTE: M' != M during training. + """ + _, L, S, _ = feat_c0.size(0), feat_c0.size(1), feat_c1.size( + 1), feat_c0.size(2) + + # normalize + feat_c0, feat_c1 = map(lambda feat: feat / feat.shape[-1]**.5, + [feat_c0, feat_c1]) + + if self.match_type == 'dual_softmax': + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, + feat_c1) / self.temperature + if mask_c0 is not None: + sim_matrix.masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + conf_matrix = F.softmax(sim_matrix, 1) * F.softmax(sim_matrix, 2) + + elif self.match_type == 'sinkhorn': + # sinkhorn, dustbin included + sim_matrix = torch.einsum('nlc,nsc->nls', feat_c0, feat_c1) + if mask_c0 is not None: + sim_matrix[:, :L, :S].masked_fill_( + ~(mask_c0[..., None] * mask_c1[:, None]).bool(), -INF) + + # build uniform prior & use sinkhorn + log_assign_matrix = self.log_optimal_transport( + sim_matrix, self.bin_score, self.skh_iters) + assign_matrix = log_assign_matrix.exp() + conf_matrix = assign_matrix[:, :-1, :-1] + + # filter prediction with dustbin score (only in evaluation mode) + if not self.training and self.skh_prefilter: + filter0 = (assign_matrix.max(dim=2)[1] == S)[:, :-1] # [N, L] + filter1 = (assign_matrix.max(dim=1)[1] == L)[:, :-1] # [N, S] + conf_matrix[filter0[..., None].repeat(1, 1, S)] = 0 + conf_matrix[filter1[:, None].repeat(1, L, 1)] = 0 + + if self.config['sparse_spvs']: + data.update({'conf_matrix_with_bin': assign_matrix.clone()}) + + data.update({'conf_matrix': conf_matrix}) + + # predict coarse matches from conf_matrix + data.update(**self.get_coarse_match(conf_matrix, data)) + + @torch.no_grad() + def get_coarse_match(self, conf_matrix, data): + """ + Args: + conf_matrix (torch.Tensor): [N, L, S] + data (dict): with keys ['hw0_i', 'hw1_i', 'hw0_c', 'hw1_c'] + Returns: + coarse_matches (dict): { + 'b_ids' (torch.Tensor): [M'], + 'i_ids' (torch.Tensor): [M'], + 'j_ids' (torch.Tensor): [M'], + 'gt_mask' (torch.Tensor): [M'], + 'm_bids' (torch.Tensor): [M], + 'mkpts0_c' (torch.Tensor): [M, 2], + 'mkpts1_c' (torch.Tensor): [M, 2], + 'mconf' (torch.Tensor): [M]} + """ + axes_lengths = { + 'h0c': data['hw0_c'][0], + 'w0c': data['hw0_c'][1], + 'h1c': data['hw1_c'][0], + 'w1c': data['hw1_c'][1] + } + _device = conf_matrix.device + # 1. confidence thresholding + mask = conf_matrix > self.thr + mask = rearrange(mask, 'b (h0c w0c) (h1c w1c) -> b h0c w0c h1c w1c', + **axes_lengths) + if 'mask0' not in data: + mask_border(mask, self.border_rm, False) + else: + mask_border_with_padding(mask, self.border_rm, False, + data['mask0'], data['mask1']) + mask = rearrange(mask, 'b h0c w0c h1c w1c -> b (h0c w0c) (h1c w1c)', + **axes_lengths) + + # 2. mutual nearest + mask = mask \ + * (conf_matrix == conf_matrix.max(dim=2, keepdim=True)[0]) \ + * (conf_matrix == conf_matrix.max(dim=1, keepdim=True)[0]) + + # 3. 
find all valid coarse matches + # this only works when at most one `True` in each row + mask_v, all_j_ids = mask.max(dim=2) + b_ids, i_ids = torch.where(mask_v) + j_ids = all_j_ids[b_ids, i_ids] + mconf = conf_matrix[b_ids, i_ids, j_ids] + + # 4. Random sampling of training samples for fine-level LoFTR + # (optional) pad samples with gt coarse-level matches + if self.training: + # NOTE: + # The sampling is performed across all pairs in a batch without manually balancing + # #samples for fine-level increases w.r.t. batch_size + if 'mask0' not in data: + num_candidates_max = mask.size(0) * max( + mask.size(1), mask.size(2)) + else: + num_candidates_max = compute_max_candidates( + data['mask0'], data['mask1']) + num_matches_train = int(num_candidates_max + * self.train_coarse_percent) + num_matches_pred = len(b_ids) + assert self.train_pad_num_gt_min < num_matches_train, 'min-num-gt-pad should be less than num-train-matches' + + # pred_indices is to select from prediction + if num_matches_pred <= num_matches_train - self.train_pad_num_gt_min: + pred_indices = torch.arange(num_matches_pred, device=_device) + else: + pred_indices = torch.randint( + num_matches_pred, + (num_matches_train - self.train_pad_num_gt_min, ), + device=_device) + + # gt_pad_indices is to select from gt padding. e.g. max(3787-4800, 200) + gt_pad_indices = torch.randint( + len(data['spv_b_ids']), + (max(num_matches_train - num_matches_pred, + self.train_pad_num_gt_min), ), + device=_device) + mconf_gt = torch.zeros( + len(data['spv_b_ids']), + device=_device) # set conf of gt paddings to all zero + + b_ids, i_ids, j_ids, mconf = map( + lambda x, y: torch.cat([x[pred_indices], y[gt_pad_indices]], + dim=0), + *zip([b_ids, data['spv_b_ids']], [i_ids, data['spv_i_ids']], + [j_ids, data['spv_j_ids']], [mconf, mconf_gt])) + + # These matches select patches that feed into fine-level network + coarse_matches = {'b_ids': b_ids, 'i_ids': i_ids, 'j_ids': j_ids} + + # 4. Update with matches in original image resolution + scale = data['hw0_i'][0] / data['hw0_c'][0] + scale0 = scale * data['scale0'][b_ids] if 'scale0' in data else scale + scale1 = scale * data['scale1'][b_ids] if 'scale1' in data else scale + mkpts0_c = torch.stack( + [i_ids % data['hw0_c'][1], i_ids // data['hw0_c'][1]], + dim=1) * scale0 + mkpts1_c = torch.stack( + [j_ids % data['hw1_c'][1], j_ids // data['hw1_c'][1]], + dim=1) * scale1 + + # These matches is the current prediction (for visualization) + coarse_matches.update({ + 'gt_mask': mconf == 0, + 'm_bids': b_ids[mconf != 0], # mconf == 0 => gt matches + 'mkpts0_c': mkpts0_c[mconf != 0], + 'mkpts1_c': mkpts1_c[mconf != 0], + 'mconf': mconf[mconf != 0] + }) + + return coarse_matches diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py new file mode 100644 index 000000000..1c9ce7015 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/cvpr_ds_config.py @@ -0,0 +1,50 @@ +from yacs.config import CfgNode as CN + + +def lower_config(yacs_cfg): + if not isinstance(yacs_cfg, CN): + return yacs_cfg + return {k.lower(): lower_config(v) for k, v in yacs_cfg.items()} + + +_CN = CN() +_CN.BACKBONE_TYPE = 'ResNetFPN' +_CN.RESOLUTION = (8, 2) # options: [(8, 2), (16, 4)] +_CN.FINE_WINDOW_SIZE = 5 # window_size in fine_level, must be odd +_CN.FINE_CONCAT_COARSE_FEAT = True + +# 1. 
LoFTR-backbone (local feature CNN) config +_CN.RESNETFPN = CN() +_CN.RESNETFPN.INITIAL_DIM = 128 +_CN.RESNETFPN.BLOCK_DIMS = [128, 196, 256] # s1, s2, s3 + +# 2. LoFTR-coarse module config +_CN.COARSE = CN() +_CN.COARSE.D_MODEL = 256 +_CN.COARSE.D_FFN = 256 +_CN.COARSE.NHEAD = 8 +_CN.COARSE.LAYER_NAMES = ['self', 'cross'] * 4 +_CN.COARSE.ATTENTION = 'linear' # options: ['linear', 'full'] +_CN.COARSE.TEMP_BUG_FIX = False + +# 3. Coarse-Matching config +_CN.MATCH_COARSE = CN() +_CN.MATCH_COARSE.THR = 0.2 +_CN.MATCH_COARSE.BORDER_RM = 2 +_CN.MATCH_COARSE.MATCH_TYPE = 'dual_softmax' # options: ['dual_softmax, 'sinkhorn'] +_CN.MATCH_COARSE.DSMAX_TEMPERATURE = 0.1 +_CN.MATCH_COARSE.SKH_ITERS = 3 +_CN.MATCH_COARSE.SKH_INIT_BIN_SCORE = 1.0 +_CN.MATCH_COARSE.SKH_PREFILTER = True +_CN.MATCH_COARSE.TRAIN_COARSE_PERCENT = 0.4 # training tricks: save GPU memory +_CN.MATCH_COARSE.TRAIN_PAD_NUM_GT_MIN = 200 # training tricks: avoid DDP deadlock + +# 4. LoFTR-fine module config +_CN.FINE = CN() +_CN.FINE.D_MODEL = 128 +_CN.FINE.D_FFN = 128 +_CN.FINE.NHEAD = 8 +_CN.FINE.LAYER_NAMES = ['self', 'cross'] * 1 +_CN.FINE.ATTENTION = 'linear' + +default_cfg = lower_config(_CN) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py new file mode 100644 index 000000000..35903212d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/fine_matching.py @@ -0,0 +1,171 @@ +import math + +import torch +import torch.nn as nn + + +def create_meshgrid( + height: int, + width: int, + normalized_coordinates: bool = True, + device=None, + dtype=None, +): + """Generate a coordinate grid for an image. + + When the flag ``normalized_coordinates`` is set to True, the grid is + normalized to be in the range :math:`[-1,1]` to be consistent with the pytorch + function :py:func:`torch.nn.functional.grid_sample`. + + Args: + height: the image height (rows). + width: the image width (cols). + normalized_coordinates: whether to normalize + coordinates in the range :math:`[-1,1]` in order to be consistent with the + PyTorch function :py:func:`torch.nn.functional.grid_sample`. + device: the device on which the grid will be generated. + dtype: the data type of the generated grid. + + Return: + grid tensor with shape :math:`(1, H, W, 2)`. + + Example: + >>> create_meshgrid(2, 2) + tensor([[[[-1., -1.], + [ 1., -1.]], + + [[-1., 1.], + [ 1., 1.]]]]) + + >>> create_meshgrid(2, 2, normalized_coordinates=False) + tensor([[[[0., 0.], + [1., 0.]], + + [[0., 1.], + [1., 1.]]]]) + """ + xs = torch.linspace(0, width - 1, width, device=device, dtype=dtype) + ys = torch.linspace(0, height - 1, height, device=device, dtype=dtype) + if normalized_coordinates: + xs = (xs / (width - 1) - 0.5) * 2 + ys = (ys / (height - 1) - 0.5) * 2 + base_grid = torch.stack( + torch.meshgrid([xs, ys], indexing='ij'), dim=-1) # WxHx2 + return base_grid.permute(1, 0, 2).unsqueeze(0) # 1xHxWx2 + + +def spatial_expectation2d(input, normalized_coordinates: bool = True): + r"""Compute the expectation of coordinate values using spatial probabilities. + + The input heatmap is assumed to represent a valid spatial probability distribution, + which can be achieved using :func:`~kornia.geometry.subpixel.spatial_softmax2d`. + + Args: + input: the input tensor representing dense spatial probabilities with shape :math:`(B, N, H, W)`. 
+ normalized_coordinates: whether to return the coordinates normalized in the range + of :math:`[-1, 1]`. Otherwise, it will return the coordinates in the range of the input shape. + + Returns: + expected value of the 2D coordinates with shape :math:`(B, N, 2)`. Output order of the coordinates is (x, y). + + Examples: + >>> heatmaps = torch.tensor([[[ + ... [0., 0., 0.], + ... [0., 0., 0.], + ... [0., 1., 0.]]]]) + >>> spatial_expectation2d(heatmaps, False) + tensor([[[1., 2.]]]) + """ + + batch_size, channels, height, width = input.shape + + # Create coordinates grid. + grid = create_meshgrid(height, width, normalized_coordinates, input.device) + grid = grid.to(input.dtype) + + pos_x = grid[..., 0].reshape(-1) + pos_y = grid[..., 1].reshape(-1) + + input_flat = input.view(batch_size, channels, -1) + + # Compute the expectation of the coordinates. + expected_y = torch.sum(pos_y * input_flat, -1, keepdim=True) + expected_x = torch.sum(pos_x * input_flat, -1, keepdim=True) + + output = torch.cat([expected_x, expected_y], -1) + + return output.view(batch_size, channels, 2) # BxNx2 + + +class FineMatching(nn.Module): + """FineMatching with s2d paradigm""" + + def __init__(self): + super().__init__() + + def forward(self, feat_f0, feat_f1, data): + """ + Args: + feat0 (torch.Tensor): [M, WW, C] + feat1 (torch.Tensor): [M, WW, C] + data (dict) + Update: + data (dict):{ + 'expec_f' (torch.Tensor): [M, 3], + 'mkpts0_f' (torch.Tensor): [M, 2], + 'mkpts1_f' (torch.Tensor): [M, 2]} + """ + M, WW, C = feat_f0.shape + W = int(math.sqrt(WW)) + scale = data['hw0_i'][0] / data['hw0_f'][0] + self.M, self.W, self.WW, self.C, self.scale = M, W, WW, C, scale + + # corner case: if no coarse matches found + if M == 0: + assert self.training is False, 'M is always >0, when training, see coarse_matching.py' + # logger.warning('No matches found in coarse-level.') + data.update({ + 'expec_f': torch.empty(0, 3, device=feat_f0.device), + 'mkpts0_f': data['mkpts0_c'], + 'mkpts1_f': data['mkpts1_c'], + }) + return + + feat_f0_picked = feat_f0_picked = feat_f0[:, WW // 2, :] + sim_matrix = torch.einsum('mc,mrc->mr', feat_f0_picked, feat_f1) + softmax_temp = 1. 
/ C**.5 + heatmap = torch.softmax( + softmax_temp * sim_matrix, dim=1).view(-1, W, W) + + # compute coordinates from heatmap + coords_normalized = spatial_expectation2d(heatmap[None], + True)[0] # [M, 2] + grid_normalized = create_meshgrid(W, W, True, heatmap.device).reshape( + 1, -1, 2) # [1, WW, 2] + + # compute std over + var = torch.sum( + grid_normalized**2 * heatmap.view(-1, WW, 1), + dim=1) - coords_normalized**2 # [M, 2] + std = torch.sum(torch.sqrt(torch.clamp(var, min=1e-10)), + -1) # [M] clamp needed for numerical stability + + # for fine-level supervision + data.update( + {'expec_f': + torch.cat([coords_normalized, std.unsqueeze(1)], -1)}) + + # compute absolute kpt coords + self.get_fine_match(coords_normalized, data) + + @torch.no_grad() + def get_fine_match(self, coords_normed, data): + W, _, _, scale = self.W, self.WW, self.C, self.scale + + # mkpts0_f and mkpts1_f + mkpts0_f = data['mkpts0_c'] + scale1 = scale * data['scale1'][ + data['b_ids']] if 'scale0' in data else scale + mkpts1_f = data['mkpts1_c'] + (coords_normed * (W // 2) * scale1)[:len(data['mconf'])] # yapf: disable + + data.update({'mkpts0_f': mkpts0_f, 'mkpts1_f': mkpts1_f}) diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py new file mode 100644 index 000000000..214a3a7af --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/geometry.py @@ -0,0 +1,57 @@ +import torch + + +@torch.no_grad() +def warp_kpts(kpts0, depth0, depth1, T_0to1, K0, K1): + """ Warp kpts0 from I0 to I1 with depth, K and Rt + Also check covisibility and depth consistency. + Depth is consistent if relative error < 0.2 (hard-coded). + + Args: + kpts0 (torch.Tensor): [N, L, 2] - , + depth0 (torch.Tensor): [N, H, W], + depth1 (torch.Tensor): [N, H, W], + T_0to1 (torch.Tensor): [N, 3, 4], + K0 (torch.Tensor): [N, 3, 3], + K1 (torch.Tensor): [N, 3, 3], + Returns: + calculable_mask (torch.Tensor): [N, L] + warped_keypoints0 (torch.Tensor): [N, L, 2] + """ + kpts0_long = kpts0.round().long() + + # Sample depth, get calculable_mask on depth != 0 + kpts0_depth = torch.stack([ + depth0[i, kpts0_long[i, :, 1], kpts0_long[i, :, 0]] + for i in range(kpts0.shape[0]) + ], + dim=0) # noqa E501 + nonzero_mask = kpts0_depth != 0 + + # Unproject + kpts0_h = torch.cat([kpts0, torch.ones_like(kpts0[:, :, [0]])], + dim=-1) * kpts0_depth[..., None] # (N, L, 3) + kpts0_cam = K0.inverse() @ kpts0_h.transpose(2, 1) # (N, 3, L) + + # Rigid Transform + w_kpts0_cam = T_0to1[:, :3, :3] @ kpts0_cam + T_0to1[:, :3, + [3]] # (N, 3, L) + w_kpts0_depth_computed = w_kpts0_cam[:, 2, :] + + # Project + w_kpts0_h = (K1 @ w_kpts0_cam).transpose(2, 1) # (N, L, 3) + w_kpts0 = w_kpts0_h[:, :, :2] / (w_kpts0_h[:, :, [2]] + 1e-4 + ) # (N, L, 2), +1e-4 to avoid zero depth + + # Covisible Check + h, w = depth1.shape[1:3] + covisible_mask = (w_kpts0[:, :, 0] > 0) * (w_kpts0[:, :, 0] < w - 1) * (w_kpts0[:, :, 1] > 0) * (w_kpts0[:, :, 1] < h - 1) # noqa E501 yapf: disable + w_kpts0_long = w_kpts0.long() + w_kpts0_long[~covisible_mask, :] = 0 + + w_kpts0_depth = torch.stack([depth1[i, w_kpts0_long[i, :, 1], w_kpts0_long[i, :, 0]] for i in range(w_kpts0_long.shape[0])], dim=0) # noqa E501 yapf: disable + consistent_mask = ( + (w_kpts0_depth - w_kpts0_depth_computed) / w_kpts0_depth).abs() < 0.2 + valid_mask = nonzero_mask * covisible_mask * consistent_mask + + return valid_mask, w_kpts0 diff --git 
a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py new file mode 100644 index 000000000..c5e7355d8 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/position_encoding.py @@ -0,0 +1,44 @@ +import math + +import torch +from torch import nn + + +class PositionEncodingSine(nn.Module): + """ + This is a sinusoidal position encoding that generalized to 2-dimensional images + """ + + def __init__(self, d_model, max_shape=(256, 256), temp_bug_fix=True): + """ + Args: + max_shape (tuple): for 1/8 featmap, the max length of 256 corresponds to 2048 pixels + temp_bug_fix (bool): As noted in this [issue](https://github.com/zju3dv/LoFTR/issues/41), + the original implementation of LoFTR includes a bug in the pos-enc impl, which has little impact + on the final performance. For now, we keep both impls for backward compatability. + We will remove the buggy impl after re-training all variants of our released models. + """ + super().__init__() + + pe = torch.zeros((d_model, *max_shape)) + y_position = torch.ones(max_shape).cumsum(0).float().unsqueeze(0) + x_position = torch.ones(max_shape).cumsum(1).float().unsqueeze(0) + if temp_bug_fix: + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / (d_model // 2))) # noqa E501 yapf: disable + else: # a buggy implementation (for backward compatability only) + div_term = torch.exp(torch.arange(0, d_model // 2, 2).float() * (-math.log(10000.0) / d_model // 2)) # noqa E501 yapf: disable + div_term = div_term[:, None, None] # [C//4, 1, 1] + pe[0::4, :, :] = torch.sin(x_position * div_term) + pe[1::4, :, :] = torch.cos(x_position * div_term) + pe[2::4, :, :] = torch.sin(y_position * div_term) + pe[3::4, :, :] = torch.cos(y_position * div_term) + + self.register_buffer( + 'pe', pe.unsqueeze(0), persistent=False) # [1, C, H, W] + + def forward(self, x): + """ + Args: + x: [N, C, H, W] + """ + return x + self.pe[:, :, :x.size(2), :x.size(3)] diff --git a/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py new file mode 100644 index 000000000..02d25d05d --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/loftr/utils/supervision.py @@ -0,0 +1,160 @@ +from math import log + +import torch +from einops import repeat +from kornia.utils import create_meshgrid +from loguru import logger + +from .geometry import warp_kpts + +# ↓ Coarse-Level supervision ↓ ############## + + +@torch.no_grad() +def mask_pts_at_padded_regions(grid_pt, mask): + """For megadepth dataset, zero-padding exists in images""" + mask = repeat(mask, 'n h w -> n (h w) c', c=2) + grid_pt[~mask.bool()] = 0 + return grid_pt + + +@torch.no_grad() +def spvs_coarse(data, config): + """ + Update: + data (dict): { + "conf_matrix_gt": [N, hw0, hw1], + 'spv_b_ids': [M] + 'spv_i_ids': [M] + 'spv_j_ids': [M] + 'spv_w_pt0_i': [N, hw0, 2], in original image resolution + 'spv_pt1_i': [N, hw1, 2], in original image resolution + } + + NOTE: + - for scannet dataset, there're 3 kinds of resolution {i, c, f} + - for megadepth dataset, there're 4 kinds of resolution {i, i_resize, c, f} + """ + # 1. 
misc + device = data['image0'].device + N, _, H0, W0 = data['image0'].shape + _, _, H1, W1 = data['image1'].shape + scale = config['LOFTR']['RESOLUTION'][0] + scale0 = scale * data['scale0'][:, None] if 'scale0' in data else scale + scale1 = scale * data['scale1'][:, None] if 'scale1' in data else scale + h0, w0, h1, w1 = map(lambda x: x // scale, [H0, W0, H1, W1]) + + # 2. warp grids + # create kpts in meshgrid and resize them to image resolution + grid_pt0_c = create_meshgrid(h0, w0, False, + device).reshape(1, h0 * w0, + 2).repeat(N, 1, + 1) # [N, hw, 2] + grid_pt0_i = scale0 * grid_pt0_c + grid_pt1_c = create_meshgrid(h1, w1, False, + device).reshape(1, h1 * w1, + 2).repeat(N, 1, 1) + grid_pt1_i = scale1 * grid_pt1_c + + # mask padded region to (0, 0), so no need to manually mask conf_matrix_gt + if 'mask0' in data: + grid_pt0_i = mask_pts_at_padded_regions(grid_pt0_i, data['mask0']) + grid_pt1_i = mask_pts_at_padded_regions(grid_pt1_i, data['mask1']) + + # warp kpts bi-directionally and resize them to coarse-level resolution + # (no depth consistency check, since it leads to worse results experimentally) + # (unhandled edge case: points with 0-depth will be warped to the left-up corner) + _, w_pt0_i = warp_kpts(grid_pt0_i, data['depth0'], data['depth1'], + data['T_0to1'], data['K0'], data['K1']) + _, w_pt1_i = warp_kpts(grid_pt1_i, data['depth1'], data['depth0'], + data['T_1to0'], data['K1'], data['K0']) + w_pt0_c = w_pt0_i / scale1 + w_pt1_c = w_pt1_i / scale0 + + # 3. check if mutual nearest neighbor + w_pt0_c_round = w_pt0_c[:, :, :].round().long() + nearest_index1 = w_pt0_c_round[..., 0] + w_pt0_c_round[..., 1] * w1 + w_pt1_c_round = w_pt1_c[:, :, :].round().long() + nearest_index0 = w_pt1_c_round[..., 0] + w_pt1_c_round[..., 1] * w0 + + # corner case: out of boundary + def out_bound_mask(pt, w, h): + return (pt[..., 0] < 0) + (pt[..., 0] >= w) + (pt[..., 1] < 0) + ( + pt[..., 1] >= h) + + nearest_index1[out_bound_mask(w_pt0_c_round, w1, h1)] = 0 + nearest_index0[out_bound_mask(w_pt1_c_round, w0, h0)] = 0 + + loop_back = torch.stack( + [nearest_index0[_b][_i] for _b, _i in enumerate(nearest_index1)], + dim=0) + correct_0to1 = loop_back == torch.arange( + h0 * w0, device=device)[None].repeat(N, 1) + correct_0to1[:, 0] = False # ignore the top-left corner + + # 4. construct a gt conf_matrix + conf_matrix_gt = torch.zeros(N, h0 * w0, h1 * w1, device=device) + b_ids, i_ids = torch.where(correct_0to1 != 0) + j_ids = nearest_index1[b_ids, i_ids] + + conf_matrix_gt[b_ids, i_ids, j_ids] = 1 + data.update({'conf_matrix_gt': conf_matrix_gt}) + + # 5. save coarse matches(gt) for training fine level + if len(b_ids) == 0: + logger.warning( + f"No groundtruth coarse match found for: {data['pair_names']}") + # this won't affect fine-level loss calculation + b_ids = torch.tensor([0], device=device) + i_ids = torch.tensor([0], device=device) + j_ids = torch.tensor([0], device=device) + + data.update({'spv_b_ids': b_ids, 'spv_i_ids': i_ids, 'spv_j_ids': j_ids}) + + # 6. save intermediate results (for fast fine-level computation) + data.update({'spv_w_pt0_i': w_pt0_i, 'spv_pt1_i': grid_pt1_i}) + + +def compute_supervision_coarse(data, config): + assert len(set( + data['dataset_name'])) == 1, 'Do not support mixed datasets training!' 
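# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# spvs_coarse above flattens 2D coarse-grid cells into 1D indices with
# x + y * w (and get_coarse_match inverts this with % and //). A tiny sketch
# of that index round-trip with hypothetical values:
import torch

w1 = 5                                              # coarse-grid width
xy = torch.tensor([[2, 3], [4, 0]])                 # (x, y) cell coordinates
flat = xy[:, 0] + xy[:, 1] * w1                     # tensor([17, 4])
back = torch.stack([flat % w1, flat // w1], dim=1)  # recovers (x, y)
assert torch.equal(back, xy)                        # lossless round-trip
# ------------------------------------------------------------------------------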
+ data_source = data['dataset_name'][0] + if data_source.lower() in ['scannet', 'megadepth']: + spvs_coarse(data, config) + else: + raise ValueError(f'Unknown data source: {data_source}') + + +# ↓ Fine-Level supervision ↓ ############## + + +@torch.no_grad() +def spvs_fine(data, config): + """ + Update: + data (dict):{ + "expec_f_gt": [M, 2]} + """ + # 1. misc + # w_pt0_i, pt1_i = data.pop('spv_w_pt0_i'), data.pop('spv_pt1_i') + w_pt0_i, pt1_i = data['spv_w_pt0_i'], data['spv_pt1_i'] + scale = config['LOFTR']['RESOLUTION'][1] + radius = config['LOFTR']['FINE_WINDOW_SIZE'] // 2 + + # 2. get coarse prediction + b_ids, i_ids, j_ids = data['b_ids'], data['i_ids'], data['j_ids'] + + # 3. compute gt + scale = scale * data['scale1'][b_ids] if 'scale0' in data else scale + # `expec_f_gt` might exceed the window, i.e. abs(*) > 1, which would be filtered later + expec_f_gt = (w_pt0_i[b_ids, i_ids] + - pt1_i[b_ids, j_ids]) / scale / radius # [M, 2] + data.update({'expec_f_gt': expec_f_gt}) + + +def compute_supervision_fine(data, config): + data_source = data['dataset_name'][0] + if data_source.lower() in ['scannet', 'megadepth']: + spvs_fine(data, config) + else: + raise NotImplementedError diff --git a/modelscope/models/cv/image_local_feature_matching/src/utils/__init__.py b/modelscope/models/cv/image_local_feature_matching/src/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py new file mode 100644 index 000000000..206f90374 --- /dev/null +++ b/modelscope/models/cv/image_local_feature_matching/src/utils/plotting.py @@ -0,0 +1,177 @@ +import bisect + +import matplotlib +import matplotlib.pyplot as plt +import numpy as np + + +def _compute_conf_thresh(data): + dataset_name = data['dataset_name'][0].lower() + if dataset_name == 'scannet': + thr = 5e-4 + elif dataset_name == 'megadepth': + thr = 1e-4 + else: + raise ValueError(f'Unknown dataset: {dataset_name}') + return thr + + +# --- VISUALIZATION --- # + + +def make_matching_figure(img0, + img1, + mkpts0, + mkpts1, + color, + kpts0=None, + kpts1=None, + text=[], + dpi=75, + path=None): + # draw image pair + assert mkpts0.shape[0] == mkpts1.shape[ + 0], f'mkpts0: {mkpts0.shape[0]} v.s. 
mkpts1: {mkpts1.shape[0]}' + fig, axes = plt.subplots(1, 2, figsize=(10, 6), dpi=dpi) + axes[0].imshow(img0, cmap='gray') + axes[1].imshow(img1, cmap='gray') + for i in range(2): # clear all frames + axes[i].get_yaxis().set_ticks([]) + axes[i].get_xaxis().set_ticks([]) + for spine in axes[i].spines.values(): + spine.set_visible(False) + plt.tight_layout(pad=1) + + if kpts0 is not None: + assert kpts1 is not None + axes[0].scatter(kpts0[:, 0], kpts0[:, 1], c='w', s=2) + axes[1].scatter(kpts1[:, 0], kpts1[:, 1], c='w', s=2) + + # draw matches + if mkpts0.shape[0] != 0 and mkpts1.shape[0] != 0: + fig.canvas.draw() + transFigure = fig.transFigure.inverted() + fkpts0 = transFigure.transform(axes[0].transData.transform(mkpts0)) + fkpts1 = transFigure.transform(axes[1].transData.transform(mkpts1)) + fig.lines = [ + matplotlib.lines.Line2D((fkpts0[i, 0], fkpts1[i, 0]), + (fkpts0[i, 1], fkpts1[i, 1]), + transform=fig.transFigure, + c=color[i], + linewidth=1) for i in range(len(mkpts0)) + ] + + axes[0].scatter(mkpts0[:, 0], mkpts0[:, 1], c=color, s=4) + axes[1].scatter(mkpts1[:, 0], mkpts1[:, 1], c=color, s=4) + + # put txts + txt_color = 'k' if img0[:100, :200].mean() > 200 else 'w' + fig.text( + 0.01, + 0.99, + '\n'.join(text), + transform=fig.axes[0].transAxes, + fontsize=15, + va='top', + ha='left', + color=txt_color) + + # save or return figure + if path: + plt.savefig(str(path), bbox_inches='tight', pad_inches=0) + plt.close() + else: + return fig + + +def _make_evaluation_figure(data, b_id, alpha='dynamic'): + b_mask = data['m_bids'] == b_id + conf_thr = _compute_conf_thresh(data) + + img0 = (data['image0'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) + img1 = (data['image1'][b_id][0].cpu().numpy() * 255).round().astype( + np.int32) + kpts0 = data['mkpts0_f'][b_mask].cpu().numpy() + kpts1 = data['mkpts1_f'][b_mask].cpu().numpy() + + # for megadepth, we visualize matches on the resized image + if 'scale0' in data: + kpts0 = kpts0 / data['scale0'][b_id].cpu().numpy()[[1, 0]] + kpts1 = kpts1 / data['scale1'][b_id].cpu().numpy()[[1, 0]] + + epi_errs = data['epi_errs'][b_mask].cpu().numpy() + correct_mask = epi_errs < conf_thr + precision = np.mean(correct_mask) if len(correct_mask) > 0 else 0 + n_correct = np.sum(correct_mask) + n_gt_matches = int(data['conf_matrix_gt'][b_id].sum().cpu()) + recall = 0 if n_gt_matches == 0 else n_correct / (n_gt_matches) + # recall might be larger than 1, since the calculation of conf_matrix_gt + # uses groundtruth depths and camera poses, but epipolar distance is used here. + + # matching info + if alpha == 'dynamic': + alpha = dynamic_alpha(len(correct_mask)) + color = error_colormap(epi_errs, conf_thr, alpha=alpha) + + text = [ + f'#Matches {len(kpts0)}', + f'Precision({conf_thr:.2e}) ({100 * precision:.1f}%): {n_correct}/{len(kpts0)}', + f'Recall({conf_thr:.2e}) ({100 * recall:.1f}%): {n_correct}/{n_gt_matches}' + ] + + # make the figure + figure = make_matching_figure(img0, img1, kpts0, kpts1, color, text=text) + return figure + + +def _make_confidence_figure(data, b_id): + # TODO: Implement confidence figure + raise NotImplementedError() + + +def make_matching_figures(data, config, mode='evaluation'): + """ Make matching figures for a batch. + + Args: + data (Dict): a batch updated by PL_LoFTR. 
+ config (Dict): matcher config + Returns: + figures (Dict[str, List[plt.figure]] + """ + assert mode in ['evaluation', 'confidence'] # 'confidence' + figures = {mode: []} + for b_id in range(data['image0'].size(0)): + if mode == 'evaluation': + fig = _make_evaluation_figure( + data, b_id, alpha=config.TRAINER.PLOT_MATCHES_ALPHA) + elif mode == 'confidence': + fig = _make_confidence_figure(data, b_id) + else: + raise ValueError(f'Unknown plot mode: {mode}') + figures[mode].append(fig) + return figures + + +def dynamic_alpha(n_matches, + milestones=[0, 300, 1000, 2000], + alphas=[1.0, 0.8, 0.4, 0.2]): + if n_matches == 0: + return 1.0 + ranges = list(zip(alphas, alphas[1:] + [None])) + loc = bisect.bisect_right(milestones, n_matches) - 1 + _range = ranges[loc] + if _range[1] is None: + return _range[0] + return _range[1] + (milestones[loc + 1] - n_matches) / ( + milestones[loc + 1] - milestones[loc]) * ( + _range[0] - _range[1]) + + +def error_colormap(err, thr, alpha=1.0): + assert alpha <= 1.0 and alpha > 0, f'Invaid alpha value: {alpha}' + x = 1 - np.clip(err / (thr * 2), 0, 1) + return np.clip( + np.stack([2 - x * 2, x * 2, + np.zeros_like(x), + np.ones_like(x) * alpha], -1), 0, 1) diff --git a/modelscope/models/cv/image_matching_fast/__init__.py b/modelscope/models/cv/image_matching_fast/__init__.py new file mode 100644 index 000000000..ced7bc449 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/__init__.py @@ -0,0 +1,24 @@ +# The implementation is made publicly available under the +# Apache 2.0 license at https://github.com/cvg/LightGlue + +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .lightglue_model import LightGlueImageMatching + +else: + _import_structure = { + 'lightglue_model': ['LightGlueImageMatching'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_matching_fast/config/__init__.py b/modelscope/models/cv/image_matching_fast/config/__init__.py new file mode 100644 index 000000000..84c52f690 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/config/__init__.py @@ -0,0 +1 @@ +from .default import lightglue_default_conf diff --git a/modelscope/models/cv/image_matching_fast/config/default.py b/modelscope/models/cv/image_matching_fast/config/default.py new file mode 100644 index 000000000..0100b96c9 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/config/default.py @@ -0,0 +1,15 @@ +lightglue_default_conf = { + 'features': 'superpoint', # superpoint disk aliked sift + 'name': 'lightglue', # just for interfacing + 'input_dim': 256, # input descriptor dimension (autoselected from weights) + 'descriptor_dim': 256, + 'add_scale_ori': False, + 'n_layers': 9, + 'num_heads': 4, + 'flash': True, # enable FlashAttention if available. 
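# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# lightglue_default_conf (config/default.py below) is a plain dict of
# defaults; a caller would typically copy it and override individual keys
# rather than mutating it in place. A minimal, hypothetical usage sketch,
# assuming the modelscope package is importable:
from modelscope.models.cv.image_matching_fast.config import lightglue_default_conf

conf = {**lightglue_default_conf, 'features': 'disk', 'filter_threshold': 0.2}
assert conf['n_layers'] == 9 and conf['features'] == 'disk'
assert lightglue_default_conf['filter_threshold'] == 0.1  # defaults untouched
# ------------------------------------------------------------------------------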
+ 'mp': False, # enable mixed precision + 'depth_confidence': 0.95, # early stopping, disable with -1 + 'width_confidence': 0.99, # point pruning, disable with -1 + 'filter_threshold': 0.1, # match threshold + 'weights': None, +} diff --git a/modelscope/models/cv/image_matching_fast/lightglue/__init__.py b/modelscope/models/cv/image_matching_fast/lightglue/__init__.py new file mode 100644 index 000000000..42719c9d5 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/__init__.py @@ -0,0 +1,6 @@ +from .aliked import ALIKED # noqa +from .disk import DISK # noqa +from .lightglue import LightGlue # noqa +from .sift import SIFT # noqa +from .superpoint import SuperPoint # noqa +from .utils import match_pair # noqa diff --git a/modelscope/models/cv/image_matching_fast/lightglue/aliked.py b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py new file mode 100644 index 000000000..71ff4f95e --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/aliked.py @@ -0,0 +1,762 @@ +# BSD 3-Clause License + +# Copyright (c) 2022, Zhao Xiaoming +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# 1. Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# 2. Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# 3. Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# Authors: +# Xiaoming Zhao, Xingming Wu, Weihai Chen, Peter C.Y. 
Chen, Qingsong Xu, and Zhengguo Li +# Code from https://github.com/Shiaoming/ALIKED + +from typing import Callable, Optional + +import torch +import torch.nn.functional as F +import torchvision +from kornia.color import grayscale_to_rgb +from torch import nn +from torch.nn.modules.utils import _pair +from torchvision.models import resnet + +from .utils import Extractor + + +def get_patches(tensor: torch.Tensor, required_corners: torch.Tensor, + ps: int) -> torch.Tensor: + c, h, w = tensor.shape + corner = (required_corners - ps / 2 + 1).long() + corner[:, 0] = corner[:, 0].clamp(min=0, max=w - 1 - ps) + corner[:, 1] = corner[:, 1].clamp(min=0, max=h - 1 - ps) + offset = torch.arange(0, ps) + + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} + x, y = torch.meshgrid(offset, offset, **kw) + patches = torch.stack((x, y)).permute(2, 1, 0).unsqueeze(2) + patches = patches.to(corner) + corner[None, None] + pts = patches.reshape(-1, 2) + sampled = tensor.permute(1, 2, 0)[tuple(pts.T)[::-1]] + sampled = sampled.reshape(ps, ps, -1, c) + assert sampled.shape[:3] == patches.shape[:3] + return sampled.permute(2, 3, 0, 1) + + +def simple_nms(scores: torch.Tensor, nms_radius: int): + """Fast Non-maximum suppression to remove nearby points""" + + zeros = torch.zeros_like(scores) + max_mask = scores == torch.nn.functional.max_pool2d( + scores, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) + + for _ in range(2): + supp_mask = ( + torch.nn.functional.max_pool2d( + max_mask.float(), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ) > 0) + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == torch.nn.functional.max_pool2d( + supp_scores, + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +class DKD(nn.Module): + + def __init__( + self, + radius: int = 2, + top_k: int = 0, + scores_th: float = 0.2, + n_limit: int = 20000, + ): + """ + Args: + radius: soft detection radius, kernel size is (2 * radius + 1) + top_k: top_k > 0: return top k keypoints + scores_th: top_k <= 0 threshold mode: + scores_th > 0: return keypoints with scores>scores_th + else: return keypoints with scores > scores.mean() + n_limit: max number of keypoint in threshold mode + """ + super().__init__() + self.radius = radius + self.top_k = top_k + self.scores_th = scores_th + self.n_limit = n_limit + self.kernel_size = 2 * self.radius + 1 + self.temperature = 0.1 # tuned temperature + self.unfold = nn.Unfold( + kernel_size=self.kernel_size, padding=self.radius) + # local xy grid + x = torch.linspace(-self.radius, self.radius, self.kernel_size) + # (kernel_size*kernel_size) x 2 : (w,h) + kw = {'indexing': 'ij'} if torch.__version__ >= '1.10' else {} + self.hw_grid = ( + torch.stack(torch.meshgrid([x, x], **kw)).view(2, -1).t()[:, + [1, 0]]) + + def forward( + self, + scores_map: torch.Tensor, + sub_pixel: bool = True, + image_size: Optional[torch.Tensor] = None, + ): + """ + :param scores_map: Bx1xHxW + :param descriptor_map: BxCxHxW + :param sub_pixel: whether to use sub-pixel keypoint detection + :return: kpts: list[Nx2,...]; kptscores: list[N,....] 
normalised position: -1~1 + """ + b, c, h, w = scores_map.shape + scores_nograd = scores_map.detach() + nms_scores = simple_nms(scores_nograd, self.radius) + + # remove border + nms_scores[:, :, :self.radius, :] = 0 + nms_scores[:, :, :, :self.radius] = 0 + if image_size is not None: + for i in range(scores_map.shape[0]): + w, h = image_size[i].long() + nms_scores[i, :, h.item() - self.radius:, :] = 0 + nms_scores[i, :, :, w.item() - self.radius:] = 0 + else: + nms_scores[:, :, -self.radius:, :] = 0 + nms_scores[:, :, :, -self.radius:] = 0 + + # detect keypoints without grad + if self.top_k > 0: + topk = torch.topk(nms_scores.view(b, -1), self.top_k) + indices_keypoints = [topk.indices[i] + for i in range(b)] # B x top_k + else: + if self.scores_th > 0: + masks = nms_scores > self.scores_th + if masks.sum() == 0: + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + else: + th = scores_nograd.reshape(b, -1).mean( + dim=1) # th = self.scores_th + masks = nms_scores > th.reshape(b, 1, 1, 1) + masks = masks.reshape(b, -1) + + indices_keypoints = [] # list, B x (any size) + scores_view = scores_nograd.reshape(b, -1) + for mask, scores in zip(masks, scores_view): + indices = mask.nonzero()[:, 0] + if len(indices) > self.n_limit: + kpts_sc = scores[indices] + sort_idx = kpts_sc.sort(descending=True)[1] + sel_idx = sort_idx[:self.n_limit] + indices = indices[sel_idx] + indices_keypoints.append(indices) + + wh = torch.tensor([w - 1, h - 1], device=scores_nograd.device) + + keypoints = [] + scoredispersitys = [] + kptscores = [] + if sub_pixel: + # detect soft keypoints with grad backpropagation + patches = self.unfold(scores_map) # B x (kernel**2) x (H*W) + self.hw_grid = self.hw_grid.to(scores_map) # to device + for b_idx in range(b): + patch = patches[b_idx].t() # (H*W) x (kernel**2) + indices_kpt = indices_keypoints[ + b_idx] # one dimension vector, say its size is M + patch_scores = patch[indices_kpt] # M x (kernel**2) + keypoints_xy_nms = torch.stack( + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], + dim=1, + ) # Mx2 + + # max is detached to prevent undesired backprop loops in the graph + max_v = patch_scores.max(dim=1).values.detach()[:, None] + x_exp = ( + (patch_scores - max_v) + / self.temperature).exp() # M * (kernel**2), in [0, 1] + + # \frac{ \sum{(i,j) \times \exp(x/T)} }{ \sum{\exp(x/T)} } + xy_residual = (x_exp @ self.hw_grid / x_exp.sum(dim=1)[:, None] + ) # Soft-argmax, Mx2 + + hw_grid_dist2 = ( + torch.norm( + (self.hw_grid[None, :, :] - xy_residual[:, None, :]) + / self.radius, + dim=-1, + )**2) + scoredispersity = (x_exp * hw_grid_dist2).sum( + dim=1) / x_exp.sum(dim=1) + + # compute result keypoints + keypoints_xy = keypoints_xy_nms + xy_residual + keypoints_xy = keypoints_xy / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + + kptscore = torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode='bilinear', + align_corners=True, + )[0, 0, 0, :] # CxN + + keypoints.append(keypoints_xy) + scoredispersitys.append(scoredispersity) + kptscores.append(kptscore) + else: + for b_idx in range(b): + indices_kpt = indices_keypoints[ + b_idx] # one dimension vector, say its size is M + # To avoid warning: UserWarning: __floordiv__ is deprecated + keypoints_xy_nms = torch.stack( + [ + indices_kpt % w, + torch.div(indices_kpt, w, rounding_mode='trunc') + ], + dim=1, + ) # Mx2 + keypoints_xy = keypoints_xy_nms / wh * 2 - 1 # (w,h) -> (-1~1,-1~1) + kptscore = 
torch.nn.functional.grid_sample( + scores_map[b_idx].unsqueeze(0), + keypoints_xy.view(1, 1, -1, 2), + mode='bilinear', + align_corners=True, + )[0, 0, 0, :] # CxN + keypoints.append(keypoints_xy) + scoredispersitys.append( + kptscore) # for jit.script compatability + kptscores.append(kptscore) + + return keypoints, scoredispersitys, kptscores + + +class InputPadder(object): + """Pads images such that dimensions are divisible by 8""" + + def __init__(self, h: int, w: int, divis_by: int = 8): + self.ht = h + self.wd = w + pad_ht = (((self.ht // divis_by) + 1) * divis_by - self.ht) % divis_by + pad_wd = (((self.wd // divis_by) + 1) * divis_by - self.wd) % divis_by + self._pad = [ + pad_wd // 2, + pad_wd - pad_wd // 2, + pad_ht // 2, + pad_ht - pad_ht // 2, + ] + + def pad(self, x: torch.Tensor): + assert x.ndim == 4 + return F.pad(x, self._pad, mode='replicate') + + def unpad(self, x: torch.Tensor): + assert x.ndim == 4 + ht = x.shape[-2] + wd = x.shape[-1] + c = [self._pad[2], ht - self._pad[3], self._pad[0], wd - self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + + +class DeformableConv2d(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False, + mask=False, + ): + super(DeformableConv2d, self).__init__() + + self.padding = padding + self.mask = mask + + self.channel_num = (3 * kernel_size * kernel_size if mask else 2 + * kernel_size * kernel_size) + self.offset_conv = nn.Conv2d( + in_channels, + self.channel_num, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=True, + ) + + self.regular_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=self.padding, + bias=bias, + ) + + def forward(self, x): + h, w = x.shape[2:] + max_offset = max(h, w) / 4.0 + + out = self.offset_conv(x) + if self.mask: + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + else: + offset = out + mask = None + offset = offset.clamp(-max_offset, max_offset) + x = torchvision.ops.deform_conv2d( + input=x, + offset=offset, + weight=self.regular_conv.weight, + bias=self.regular_conv.bias, + padding=self.padding, + mask=mask, + ) + return x + + +def get_conv( + inplanes, + planes, + kernel_size=3, + stride=1, + padding=1, + bias=False, + conv_type='conv', + mask=False, +): + if conv_type == 'conv': + conv = nn.Conv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) + elif conv_type == 'dcn': + conv = DeformableConv2d( + inplanes, + planes, + kernel_size=kernel_size, + stride=stride, + padding=_pair(padding), + bias=bias, + mask=mask, + ) + else: + raise TypeError + return conv + + +class ConvBlock(nn.Module): + + def __init__( + self, + in_channels, + out_channels, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = 'conv', + mask: bool = False, + ): + super().__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + self.conv1 = get_conv( + in_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) + self.bn1 = norm_layer(out_channels) + self.conv2 = get_conv( + out_channels, + out_channels, + kernel_size=3, + conv_type=conv_type, + mask=mask) + self.bn2 = norm_layer(out_channels) + + def forward(self, x): + x = self.gate(self.bn1(self.conv1(x))) # B x 
in_channels x H x W + x = self.gate(self.bn2(self.conv2(x))) # B x out_channels x H x W + return x + + +# modified based on torchvision\models\resnet.py#27->BasicBlock +class ResBlock(nn.Module): + expansion: int = 1 + + def __init__( + self, + inplanes: int, + planes: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + groups: int = 1, + base_width: int = 64, + dilation: int = 1, + gate: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + conv_type: str = 'conv', + mask: bool = False, + ) -> None: + super(ResBlock, self).__init__() + if gate is None: + self.gate = nn.ReLU(inplace=True) + else: + self.gate = gate + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if groups != 1 or base_width != 64: + raise ValueError( + 'ResBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError('Dilation > 1 not supported in ResBlock') + # Both self.conv1 and self.downsample layers + # downsample the input when stride != 1 + self.conv1 = get_conv( + inplanes, planes, kernel_size=3, conv_type=conv_type, mask=mask) + self.bn1 = norm_layer(planes) + self.conv2 = get_conv( + planes, planes, kernel_size=3, conv_type=conv_type, mask=mask) + self.bn2 = norm_layer(planes) + self.downsample = downsample + self.stride = stride + + def forward(self, x: torch.Tensor) -> torch.Tensor: + identity = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.gate(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.gate(out) + + return out + + +class SDDH(nn.Module): + + def __init__( + self, + dims: int, + kernel_size: int = 3, + n_pos: int = 8, + gate=nn.ReLU(), + conv2D=False, + mask=False, + ): + super(SDDH, self).__init__() + self.kernel_size = kernel_size + self.n_pos = n_pos + self.conv2D = conv2D + self.mask = mask + + self.get_patches_func = get_patches + + # estimate offsets + self.channel_num = 3 * n_pos if mask else 2 * n_pos + self.offset_conv = nn.Sequential( + nn.Conv2d( + dims, + self.channel_num, + kernel_size=kernel_size, + stride=1, + padding=0, + bias=True, + ), + gate, + nn.Conv2d( + self.channel_num, + self.channel_num, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ), + ) + + # sampled feature conv + self.sf_conv = nn.Conv2d( + dims, dims, kernel_size=1, stride=1, padding=0, bias=False) + + # convM + if not conv2D: + # deformable desc weights + agg_weights = torch.nn.Parameter(torch.rand(n_pos, dims, dims)) + self.register_parameter('agg_weights', agg_weights) + else: + self.convM = nn.Conv2d( + dims * n_pos, + dims, + kernel_size=1, + stride=1, + padding=0, + bias=False) + + def forward(self, x, keypoints): + # x: [B,C,H,W] + # keypoints: list, [[N_kpts,2], ...] 
(w,h) + b, c, h, w = x.shape + wh = torch.tensor([[w - 1, h - 1]], device=x.device) + max_offset = max(h, w) / 4.0 + + offsets = [] + descriptors = [] + # get offsets for each keypoint + for ib in range(b): + xi, kptsi = x[ib], keypoints[ib] + kptsi_wh = (kptsi / 2 + 0.5) * wh + N_kpts = len(kptsi) + + if self.kernel_size > 1: + patch = self.get_patches_func( + xi, kptsi_wh.long(), self.kernel_size) # [N_kpts, C, K, K] + else: + kptsi_wh_long = kptsi_wh.long() + patch = ( + xi[:, kptsi_wh_long[:, 1], + kptsi_wh_long[:, + 0]].permute(1, + 0).reshape(N_kpts, c, 1, 1)) + + offset = self.offset_conv(patch).clamp( + -max_offset, max_offset) # [N_kpts, 2*n_pos, 1, 1] + if self.mask: + offset = (offset[:, :, 0, 0].view(N_kpts, 3, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 3] + offset = offset[:, :, :-1] # [N_kpts, n_pos, 2] + mask_weight = torch.sigmoid(offset[:, :, + -1]) # [N_kpts, n_pos] + else: + offset = (offset[:, :, 0, 0].view(N_kpts, 2, + self.n_pos).permute(0, 2, 1) + ) # [N_kpts, n_pos, 2] + offsets.append(offset) # for visualization + + # get sample positions + pos = kptsi_wh.unsqueeze(1) + offset # [N_kpts, n_pos, 2] + pos = 2.0 * pos / wh[None] - 1 + pos = pos.reshape(1, N_kpts * self.n_pos, 1, 2) + + # sample features + features = F.grid_sample( + xi.unsqueeze(0), pos, mode='bilinear', + align_corners=True) # [1,C,(N_kpts*n_pos),1] + features = features.reshape(c, N_kpts, self.n_pos, + 1).permute(1, 0, 2, + 3) # [N_kpts, C, n_pos, 1] + if self.mask: + features = torch.einsum('ncpo,np->ncpo', features, mask_weight) + + features = torch.selu_(self.sf_conv(features)).squeeze( + -1) # [N_kpts, C, n_pos] + # convM + if not self.conv2D: + descs = torch.einsum('ncp,pcd->nd', features, + self.agg_weights) # [N_kpts, C] + else: + features = features.reshape( + N_kpts, -1)[:, :, None, None] # [N_kpts, C*n_pos, 1, 1] + descs = self.convM(features).squeeze() # [N_kpts, C] + + # normalize + descs = F.normalize(descs, p=2.0, dim=1) + descriptors.append(descs) + + return descriptors, offsets + + +class ALIKED(Extractor): + default_conf = { + 'model_name': 'aliked-n16', + 'max_num_keypoints': -1, + 'detection_threshold': 0.2, + 'nms_radius': 2, + } + + checkpoint_url = 'https://github.com/Shiaoming/ALIKED/raw/main/models/{}.pth' + + n_limit_max = 20000 + + # c1, c2, c3, c4, dim, K, M + cfgs = { + 'aliked-t16': [8, 16, 32, 64, 64, 3, 16], + 'aliked-n16': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n16rot': [16, 32, 64, 128, 128, 3, 16], + 'aliked-n32': [16, 32, 64, 128, 128, 3, 32], + } + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, **conf): + super().__init__(**conf) # Update with default configuration. 
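# ---- [Editor's note] Illustrative aside; not part of the original patch. ----
# DKD and SDDH above keep keypoints in normalized [-1, 1] coordinates, and
# ALIKED.forward() converts them back to pixels with wh * (kpts + 1) / 2.
# A tiny round-trip sketch of that convention (image size is hypothetical):
import torch

w, h = 640, 480
wh = torch.tensor([w - 1, h - 1], dtype=torch.float32)
kpts_norm = torch.tensor([[-1.0, -1.0], [0.0, 0.0], [1.0, 1.0]])
kpts_px = wh * (kpts_norm + 1) / 2.0    # (0, 0), (319.5, 239.5), (639, 479)
kpts_back = kpts_px / wh * 2 - 1        # inverse mapping used inside DKD/SDDH
assert torch.allclose(kpts_back, kpts_norm)
# ------------------------------------------------------------------------------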
+ conf = self.conf + c1, c2, c3, c4, dim, K, M = self.cfgs[conf.model_name] + conv_types = ['conv', 'conv', 'dcn', 'dcn'] + conv2D = False + mask = False + + # build model + self.pool2 = nn.AvgPool2d(kernel_size=2, stride=2) + self.pool4 = nn.AvgPool2d(kernel_size=4, stride=4) + self.norm = nn.BatchNorm2d + self.gate = nn.SELU(inplace=True) + self.block1 = ConvBlock( + 3, c1, self.gate, self.norm, conv_type=conv_types[0]) + self.block2 = self.get_resblock(c1, c2, conv_types[1], mask) + self.block3 = self.get_resblock(c2, c3, conv_types[2], mask) + self.block4 = self.get_resblock(c3, c4, conv_types[3], mask) + + self.conv1 = resnet.conv1x1(c1, dim // 4) + self.conv2 = resnet.conv1x1(c2, dim // 4) + self.conv3 = resnet.conv1x1(c3, dim // 4) + self.conv4 = resnet.conv1x1(dim, dim // 4) + self.upsample2 = nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=True) + self.upsample4 = nn.Upsample( + scale_factor=4, mode='bilinear', align_corners=True) + self.upsample8 = nn.Upsample( + scale_factor=8, mode='bilinear', align_corners=True) + self.upsample32 = nn.Upsample( + scale_factor=32, mode='bilinear', align_corners=True) + self.score_head = nn.Sequential( + resnet.conv1x1(dim, 8), + self.gate, + resnet.conv3x3(8, 4), + self.gate, + resnet.conv3x3(4, 4), + self.gate, + resnet.conv3x3(4, 1), + ) + self.desc_head = SDDH( + dim, K, M, gate=self.gate, conv2D=conv2D, mask=mask) + self.dkd = DKD( + radius=conf.nms_radius, + top_k=-1 + if conf.detection_threshold > 0 else conf.max_num_keypoints, + scores_th=conf.detection_threshold, + n_limit=conf.max_num_keypoints + if conf.max_num_keypoints > 0 else self.n_limit_max, + ) + + state_dict = torch.hub.load_state_dict_from_url( + self.checkpoint_url.format(conf.model_name), map_location='cpu') + self.load_state_dict(state_dict, strict=True) + + def get_resblock(self, c_in, c_out, conv_type, mask): + return ResBlock( + c_in, + c_out, + 1, + nn.Conv2d(c_in, c_out, 1), + gate=self.gate, + norm_layer=self.norm, + conv_type=conv_type, + mask=mask, + ) + + def extract_dense_map(self, image): + # Pads images such that dimensions are divisible by + div_by = 2**5 + padder = InputPadder(image.shape[-2], image.shape[-1], div_by) + image = padder.pad(image) + + # ================================== feature encoder + x1 = self.block1(image) # B x c1 x H x W + x2 = self.pool2(x1) + x2 = self.block2(x2) # B x c2 x H/2 x W/2 + x3 = self.pool4(x2) + x3 = self.block3(x3) # B x c3 x H/8 x W/8 + x4 = self.pool4(x3) + x4 = self.block4(x4) # B x dim x H/32 x W/32 + # ================================== feature aggregation + x1 = self.gate(self.conv1(x1)) # B x dim//4 x H x W + x2 = self.gate(self.conv2(x2)) # B x dim//4 x H//2 x W//2 + x3 = self.gate(self.conv3(x3)) # B x dim//4 x H//8 x W//8 + x4 = self.gate(self.conv4(x4)) # B x dim//4 x H//32 x W//32 + x2_up = self.upsample2(x2) # B x dim//4 x H x W + x3_up = self.upsample8(x3) # B x dim//4 x H x W + x4_up = self.upsample32(x4) # B x dim//4 x H x W + x1234 = torch.cat([x1, x2_up, x3_up, x4_up], dim=1) + # ================================== score head + score_map = torch.sigmoid(self.score_head(x1234)) + feature_map = torch.nn.functional.normalize(x1234, p=2, dim=1) + + # Unpads images + feature_map = padder.unpad(feature_map) + score_map = padder.unpad(score_map) + + return feature_map, score_map + + def forward(self, data: dict) -> dict: + image = data['image'] + if image.shape[1] == 1: + image = grayscale_to_rgb(image) + feature_map, score_map = self.extract_dense_map(image) + keypoints, kptscores, 
scoredispersitys = self.dkd( + score_map, image_size=data.get('image_size')) + descriptors, offsets = self.desc_head(feature_map, keypoints) + + _, _, h, w = image.shape + wh = torch.tensor([w - 1, h - 1], device=image.device) + # no padding required + # we can set detection_threshold=-1 and conf.max_num_keypoints > 0 + return { + 'keypoints': wh * (torch.stack(keypoints) + 1) / 2.0, # B x N x 2 + 'descriptors': torch.stack(descriptors), # B x N x D + 'keypoint_scores': torch.stack(kptscores), # B x N + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/disk.py b/modelscope/models/cv/image_matching_fast/lightglue/disk.py new file mode 100644 index 000000000..08d521c44 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/disk.py @@ -0,0 +1,55 @@ +import kornia +import torch + +from .utils import Extractor + + +class DISK(Extractor): + default_conf = { + 'weights': 'depth', + 'max_num_keypoints': None, + 'desc_dim': 128, + 'nms_window_size': 5, + 'detection_threshold': 0.0, + 'pad_if_not_divisible': True, + } + + preprocess_conf = { + 'resize': 1024, + 'grayscale': False, + } + + required_data_keys = ['image'] + + def __init__(self, **conf) -> None: + super().__init__(**conf) # Update with default configuration. + self.model = kornia.feature.DISK.from_pretrained(self.conf.weights) + + def forward(self, data: dict) -> dict: + """Compute keypoints, scores, descriptors for image""" + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + image = data['image'] + if image.shape[1] == 1: + image = kornia.color.grayscale_to_rgb(image) + features = self.model( + image, + n=self.conf.max_num_keypoints, + window_size=self.conf.nms_window_size, + score_threshold=self.conf.detection_threshold, + pad_if_not_divisible=self.conf.pad_if_not_divisible, + ) + keypoints = [f.keypoints for f in features] + scores = [f.detection_scores for f in features] + descriptors = [f.descriptors for f in features] + del features + + keypoints = torch.stack(keypoints, 0) + scores = torch.stack(scores, 0) + descriptors = torch.stack(descriptors, 0) + + return { + 'keypoints': keypoints.to(image).contiguous(), + 'keypoint_scores': scores.to(image).contiguous(), + 'descriptors': descriptors.to(image).contiguous(), + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py new file mode 100644 index 000000000..16888b556 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/lightglue.py @@ -0,0 +1,641 @@ +import os.path as osp +import warnings +from pathlib import Path +from types import SimpleNamespace +from typing import Callable, List, Optional, Tuple + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + +try: + from flash_attn.modules.mha import FlashCrossAttention +except ModuleNotFoundError: + FlashCrossAttention = None + +if FlashCrossAttention or hasattr(F, 'scaled_dot_product_attention'): + FLASH_AVAILABLE = True +else: + FLASH_AVAILABLE = False + +torch.backends.cudnn.deterministic = True + + +@torch.cuda.amp.custom_fwd(cast_inputs=torch.float32) +def normalize_keypoints(kpts: torch.Tensor, + size: Optional[torch.Tensor] = None) -> torch.Tensor: + if size is None: + size = 1 + kpts.max(-2).values - kpts.min(-2).values + elif not isinstance(size, torch.Tensor): + size = torch.tensor(size, device=kpts.device, dtype=kpts.dtype) + size = size.to(kpts) + shift = size / 2 + scale = size.max(-1).values / 2 + kpts = 
(kpts - shift[..., None, :]) / scale[..., None, None] + return kpts + + +def pad_to_length(x: torch.Tensor, length: int) -> Tuple[torch.Tensor]: + if length <= x.shape[-2]: + return x, torch.ones_like(x[..., :1], dtype=torch.bool) + pad = torch.ones( + *x.shape[:-2], + length - x.shape[-2], + x.shape[-1], + device=x.device, + dtype=x.dtype) + y = torch.cat([x, pad], dim=-2) + mask = torch.zeros(*y.shape[:-1], 1, dtype=torch.bool, device=x.device) + mask[..., :x.shape[-2], :] = True + return y, mask + + +def rotate_half(x: torch.Tensor) -> torch.Tensor: + x = x.unflatten(-1, (-1, 2)) + x1, x2 = x.unbind(dim=-1) + return torch.stack((-x2, x1), dim=-1).flatten(start_dim=-2) + + +def apply_cached_rotary_emb(freqs: torch.Tensor, + t: torch.Tensor) -> torch.Tensor: + return (t * freqs[0]) + (rotate_half(t) * freqs[1]) + + +class LearnableFourierPositionalEncoding(nn.Module): + + def __init__(self, + M: int, + dim: int, + F_dim: int = None, + gamma: float = 1.0) -> None: + super().__init__() + F_dim = F_dim if F_dim is not None else dim + self.gamma = gamma + self.Wr = nn.Linear(M, F_dim // 2, bias=False) + nn.init.normal_(self.Wr.weight.data, mean=0, std=self.gamma**-2) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """encode position vector""" + projected = self.Wr(x) + cosines, sines = torch.cos(projected), torch.sin(projected) + emb = torch.stack([cosines, sines], 0).unsqueeze(-3) + return emb.repeat_interleave(2, dim=-1) + + +class TokenConfidence(nn.Module): + + def __init__(self, dim: int) -> None: + super().__init__() + self.token = nn.Sequential(nn.Linear(dim, 1), nn.Sigmoid()) + + def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): + """get confidence tokens""" + return ( + self.token(desc0.detach()).squeeze(-1), + self.token(desc1.detach()).squeeze(-1), + ) + + +class Attention(nn.Module): + + def __init__(self, allow_flash: bool) -> None: + super().__init__() + if allow_flash and not FLASH_AVAILABLE: + warnings.warn( + 'FlashAttention is not available. 
For optimal speed, ' + 'consider installing torch >= 2.0 or flash-attn.', + stacklevel=2, + ) + self.enable_flash = allow_flash and FLASH_AVAILABLE + self.has_sdp = hasattr(F, 'scaled_dot_product_attention') + if allow_flash and FlashCrossAttention: + self.flash_ = FlashCrossAttention() + if self.has_sdp: + torch.backends.cuda.enable_flash_sdp(allow_flash) + + def forward(self, + q, + k, + v, + mask: Optional[torch.Tensor] = None) -> torch.Tensor: + if self.enable_flash and q.device.type == 'cuda': + # use torch 2.0 scaled_dot_product_attention with flash + if self.has_sdp: + args = [x.half().contiguous() for x in [q, k, v]] + v = F.scaled_dot_product_attention( + *args, attn_mask=mask).to(q.dtype) + return v if mask is None else v.nan_to_num() + else: + assert mask is None + q, k, v = [x.transpose(-2, -3).contiguous() for x in [q, k, v]] + m = self.flash_(q.half(), torch.stack([k, v], 2).half()) + return m.transpose(-2, -3).to(q.dtype).clone() + elif self.has_sdp: + args = [x.contiguous() for x in [q, k, v]] + v = F.scaled_dot_product_attention(*args, attn_mask=mask) + return v if mask is None else v.nan_to_num() + else: + s = q.shape[-1]**-0.5 + sim = torch.einsum('...id,...jd->...ij', q, k) * s + if mask is not None: + sim.masked_fill(~mask, -float('inf')) + attn = F.softmax(sim, -1) + return torch.einsum('...ij,...jd->...id', attn, v) + + +class SelfBlock(nn.Module): + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + assert self.embed_dim % num_heads == 0 + self.head_dim = self.embed_dim // num_heads + self.Wqkv = nn.Linear(embed_dim, 3 * embed_dim, bias=bias) + self.inner_attn = Attention(flash) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.ffn = nn.Sequential( + nn.Linear(2 * embed_dim, 2 * embed_dim), + nn.LayerNorm(2 * embed_dim, elementwise_affine=True), + nn.GELU(), + nn.Linear(2 * embed_dim, embed_dim), + ) + + def forward( + self, + x: torch.Tensor, + encoding: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + qkv = self.Wqkv(x) + qkv = qkv.unflatten(-1, (self.num_heads, -1, 3)).transpose(1, 2) + q, k, v = qkv[..., 0], qkv[..., 1], qkv[..., 2] + q = apply_cached_rotary_emb(encoding, q) + k = apply_cached_rotary_emb(encoding, k) + context = self.inner_attn(q, k, v, mask=mask) + message = self.out_proj(context.transpose(1, 2).flatten(start_dim=-2)) + return x + self.ffn(torch.cat([x, message], -1)) + + +class CrossBlock(nn.Module): + + def __init__(self, + embed_dim: int, + num_heads: int, + flash: bool = False, + bias: bool = True) -> None: + super().__init__() + self.heads = num_heads + dim_head = embed_dim // num_heads + self.scale = dim_head**-0.5 + inner_dim = dim_head * num_heads + self.to_qk = nn.Linear(embed_dim, inner_dim, bias=bias) + self.to_v = nn.Linear(embed_dim, inner_dim, bias=bias) + self.to_out = nn.Linear(inner_dim, embed_dim, bias=bias) + self.ffn = nn.Sequential( + nn.Linear(2 * embed_dim, 2 * embed_dim), + nn.LayerNorm(2 * embed_dim, elementwise_affine=True), + nn.GELU(), + nn.Linear(2 * embed_dim, embed_dim), + ) + if flash and FLASH_AVAILABLE: + self.flash = Attention(True) + else: + self.flash = None + + def map_(self, func: Callable, x0: torch.Tensor, x1: torch.Tensor): + return func(x0), func(x1) + + def forward(self, + x0: torch.Tensor, + x1: torch.Tensor, + mask: Optional[torch.Tensor] = None) -> List[torch.Tensor]: + qk0, qk1 = self.map_(self.to_qk, x0, x1) + v0, 
v1 = self.map_(self.to_v, x0, x1) + qk0, qk1, v0, v1 = map( + lambda t: t.unflatten(-1, (self.heads, -1)).transpose(1, 2), + (qk0, qk1, v0, v1), + ) + if self.flash is not None and qk0.device.type == 'cuda': + m0 = self.flash(qk0, qk1, v1, mask) + m1 = self.flash( + qk1, qk0, v0, + mask.transpose(-1, -2) if mask is not None else None) + else: + qk0, qk1 = qk0 * self.scale**0.5, qk1 * self.scale**0.5 + sim = torch.einsum('bhid, bhjd -> bhij', qk0, qk1) + if mask is not None: + sim = sim.masked_fill(~mask, -float('inf')) + attn01 = F.softmax(sim, dim=-1) + attn10 = F.softmax(sim.transpose(-2, -1).contiguous(), dim=-1) + m0 = torch.einsum('bhij, bhjd -> bhid', attn01, v1) + m1 = torch.einsum('bhji, bhjd -> bhid', attn10.transpose(-2, -1), + v0) + if mask is not None: + m0, m1 = m0.nan_to_num(), m1.nan_to_num() + m0, m1 = self.map_(lambda t: t.transpose(1, 2).flatten(start_dim=-2), + m0, m1) + m0, m1 = self.map_(self.to_out, m0, m1) + x0 = x0 + self.ffn(torch.cat([x0, m0], -1)) + x1 = x1 + self.ffn(torch.cat([x1, m1], -1)) + return x0, x1 + + +class TransformerLayer(nn.Module): + + def __init__(self, *args, **kwargs): + super().__init__() + self.self_attn = SelfBlock(*args, **kwargs) + self.cross_attn = CrossBlock(*args, **kwargs) + + def forward( + self, + desc0, + desc1, + encoding0, + encoding1, + mask0: Optional[torch.Tensor] = None, + mask1: Optional[torch.Tensor] = None, + ): + if mask0 is not None and mask1 is not None: + return self.masked_forward(desc0, desc1, encoding0, encoding1, + mask0, mask1) + else: + desc0 = self.self_attn(desc0, encoding0) + desc1 = self.self_attn(desc1, encoding1) + return self.cross_attn(desc0, desc1) + + # This part is compiled and allows padding inputs + def masked_forward(self, desc0, desc1, encoding0, encoding1, mask0, mask1): + mask = mask0 & mask1.transpose(-1, -2) + mask0 = mask0 & mask0.transpose(-1, -2) + mask1 = mask1 & mask1.transpose(-1, -2) + desc0 = self.self_attn(desc0, encoding0, mask0) + desc1 = self.self_attn(desc1, encoding1, mask1) + return self.cross_attn(desc0, desc1, mask) + + +def sigmoid_log_double_softmax(sim: torch.Tensor, z0: torch.Tensor, + z1: torch.Tensor) -> torch.Tensor: + """create the log assignment matrix from logits and similarity""" + b, m, n = sim.shape + certainties = F.logsigmoid(z0) + F.logsigmoid(z1).transpose(1, 2) + scores0 = F.log_softmax(sim, 2) + scores1 = F.log_softmax(sim.transpose(-1, -2).contiguous(), + 2).transpose(-1, -2) + scores = sim.new_full((b, m + 1, n + 1), 0) + scores[:, :m, :n] = scores0 + scores1 + certainties + scores[:, :-1, -1] = F.logsigmoid(-z0.squeeze(-1)) + scores[:, -1, :-1] = F.logsigmoid(-z1.squeeze(-1)) + return scores + + +class MatchAssignment(nn.Module): + + def __init__(self, dim: int) -> None: + super().__init__() + self.dim = dim + self.matchability = nn.Linear(dim, 1, bias=True) + self.final_proj = nn.Linear(dim, dim, bias=True) + + def forward(self, desc0: torch.Tensor, desc1: torch.Tensor): + """build assignment matrix from descriptors""" + mdesc0, mdesc1 = self.final_proj(desc0), self.final_proj(desc1) + _, _, d = mdesc0.shape + mdesc0, mdesc1 = mdesc0 / d**0.25, mdesc1 / d**0.25 + sim = torch.einsum('bmd,bnd->bmn', mdesc0, mdesc1) + z0 = self.matchability(desc0) + z1 = self.matchability(desc1) + scores = sigmoid_log_double_softmax(sim, z0, z1) + return scores, sim + + def get_matchability(self, desc: torch.Tensor): + return torch.sigmoid(self.matchability(desc)).squeeze(-1) + + +def filter_matches(scores: torch.Tensor, th: float): + """obtain matches from a log assignment 
matrix [Bx M+1 x N+1]""" + max0, max1 = scores[:, :-1, :-1].max(2), scores[:, :-1, :-1].max(1) + m0, m1 = max0.indices, max1.indices + indices0 = torch.arange(m0.shape[1], device=m0.device)[None] + indices1 = torch.arange(m1.shape[1], device=m1.device)[None] + mutual0 = indices0 == m1.gather(1, m0) + mutual1 = indices1 == m0.gather(1, m1) + max0_exp = max0.values.exp() + zero = max0_exp.new_tensor(0) + mscores0 = torch.where(mutual0, max0_exp, zero) + mscores1 = torch.where(mutual1, mscores0.gather(1, m1), zero) + valid0 = mutual0 & (mscores0 > th) + valid1 = mutual1 & valid0.gather(1, m1) + m0 = torch.where(valid0, m0, -1) + m1 = torch.where(valid1, m1, -1) + return m0, m1, mscores0, mscores1 + + +class LightGlue(nn.Module): + + # Point pruning involves an overhead (gather). + # Therefore, we only activate it if there are enough keypoints. + pruning_keypoint_thresholds = { + 'cpu': -1, + 'mps': -1, + 'cuda': 1024, + 'flash': 1536, + } + + required_data_keys = ['image0', 'image1'] + + version = 'v0.1_arxiv' + weight_path = '{}_lightglue.pth' + + features = { + 'superpoint': { + 'weights': 'superpoint_lightglue', + 'input_dim': 256, + }, + 'disk': { + 'weights': 'disk_lightglue', + 'input_dim': 128, + }, + 'aliked': { + 'weights': 'aliked_lightglue', + 'input_dim': 128, + }, + 'sift': { + 'weights': 'sift_lightglue', + 'input_dim': 128, + 'add_scale_ori': True, + }, + } + + def __init__(self, model_dir, default_conf, **conf) -> None: + super().__init__() + self.conf = conf = SimpleNamespace(**{**default_conf, **conf}) + if conf.features is not None: + if conf.features not in self.features: + raise ValueError( + f'Unsupported features: {conf.features} not in ' + f"{{{','.join(self.features)}}}") + for k, v in self.features[conf.features].items(): + setattr(conf, k, v) + + if conf.input_dim != conf.descriptor_dim: + self.input_proj = nn.Linear( + conf.input_dim, conf.descriptor_dim, bias=True) + else: + self.input_proj = nn.Identity() + + head_dim = conf.descriptor_dim // conf.num_heads + self.posenc = LearnableFourierPositionalEncoding( + 2 + 2 * self.conf.add_scale_ori, head_dim, head_dim) + + h, n, d = conf.num_heads, conf.n_layers, conf.descriptor_dim + + self.transformers = nn.ModuleList( + [TransformerLayer(d, h, conf.flash) for _ in range(n)]) + + self.log_assignment = nn.ModuleList( + [MatchAssignment(d) for _ in range(n)]) + self.token_confidence = nn.ModuleList( + [TokenConfidence(d) for _ in range(n - 1)]) + self.register_buffer( + 'confidence_thresholds', + torch.Tensor([ + self.confidence_threshold(i) for i in range(self.conf.n_layers) + ]), + ) + + state_dict = None + if conf.features is not None: + state_dict = torch.load( + osp.join(model_dir, self.weight_path.format(conf.features)), + map_location='cpu') + self.load_state_dict(state_dict, strict=False) + elif conf.weights is not None: + path = Path(__file__).parent + path = path / 'weights/{}.pth'.format(self.conf.weights) + state_dict = torch.load(str(path), map_location='cpu') + + if state_dict: + # rename old state dict entries + for i in range(self.conf.n_layers): + pattern = f'self_attn.{i}', f'transformers.{i}.self_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } + pattern = f'cross_attn.{i}', f'transformers.{i}.cross_attn' + state_dict = { + k.replace(*pattern): v + for k, v in state_dict.items() + } + self.load_state_dict(state_dict, strict=False) + + # static lengths LightGlue is compiled for (only used with torch.compile) + self.static_lengths = None + + def compile(self, + 
mode='reduce-overhead', + static_lengths=[256, 512, 768, 1024, 1280, 1536]): + if self.conf.width_confidence != -1: + warnings.warn( + 'Point pruning is partially disabled for compiled forward.', + stacklevel=2, + ) + + for i in range(self.conf.n_layers): + self.transformers[i].masked_forward = torch.compile( + self.transformers[i].masked_forward, mode=mode, fullgraph=True) + + self.static_lengths = static_lengths + + def forward(self, data: dict) -> dict: + """ + Match keypoints and descriptors between two images + + Input (dict): + image0: dict + keypoints: [B x M x 2] + descriptors: [B x M x D] + image: [B x C x H x W] or image_size: [B x 2] + image1: dict + keypoints: [B x N x 2] + descriptors: [B x N x D] + image: [B x C x H x W] or image_size: [B x 2] + Output (dict): + log_assignment: [B x M+1 x N+1] + matches0: [B x M] + matching_scores0: [B x M] + matches1: [B x N] + matching_scores1: [B x N] + matches: List[[Si x 2]], scores: List[[Si]] + """ + with torch.autocast(enabled=self.conf.mp, device_type='cuda'): + return self._forward(data) + + def _forward(self, data: dict) -> dict: + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + data0, data1 = data['image0'], data['image1'] + kpts0, kpts1 = data0['keypoints'], data1['keypoints'] + b, m, _ = kpts0.shape + b, n, _ = kpts1.shape + device = kpts0.device + size0, size1 = data0.get('image_size'), data1.get('image_size') + kpts0 = normalize_keypoints(kpts0, size0).clone() + kpts1 = normalize_keypoints(kpts1, size1).clone() + + if self.conf.add_scale_ori: + kpts0 = torch.cat( + [kpts0] + [data0[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) + kpts1 = torch.cat( + [kpts1] + [data1[k].unsqueeze(-1) for k in ('scales', 'oris')], + -1) + desc0 = data0['descriptors'].detach().contiguous() + desc1 = data1['descriptors'].detach().contiguous() + + assert desc0.shape[-1] == self.conf.input_dim + assert desc1.shape[-1] == self.conf.input_dim + + if torch.is_autocast_enabled(): + desc0 = desc0.half() + desc1 = desc1.half() + + mask0, mask1 = None, None + c = max(m, n) + do_compile = self.static_lengths and c <= max(self.static_lengths) + if do_compile: + kn = min([k for k in self.static_lengths if k >= c]) + desc0, mask0 = pad_to_length(desc0, kn) + desc1, mask1 = pad_to_length(desc1, kn) + kpts0, _ = pad_to_length(kpts0, kn) + kpts1, _ = pad_to_length(kpts1, kn) + desc0 = self.input_proj(desc0) + desc1 = self.input_proj(desc1) + # cache positional embeddings + encoding0 = self.posenc(kpts0) + encoding1 = self.posenc(kpts1) + + # GNN + final_proj + assignment + do_early_stop = self.conf.depth_confidence > 0 + do_point_pruning = self.conf.width_confidence > 0 and not do_compile + pruning_th = self.pruning_min_kpts(device) + if do_point_pruning: + ind0 = torch.arange(0, m, device=device)[None] + ind1 = torch.arange(0, n, device=device)[None] + # We store the index of the layer at which pruning is detected. 
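+ # Added clarification (hedged, not upstream text): prune0/prune1 start at 1 for
+ # every keypoint and are incremented each time a point survives a pruning step,
+ # so larger values mean the point was kept for more layers. Toy example (assumed
+ # values): with m = 4 and a pruning step that keeps indices [0, 2], ind0 becomes
+ # tensor([[0, 2]]) and prune0 becomes tensor([[2, 1, 2, 1]]).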
+ prune0 = torch.ones_like(ind0) + prune1 = torch.ones_like(ind1) + token0, token1 = None, None + for i in range(self.conf.n_layers): + desc0, desc1 = self.transformers[i]( + desc0, desc1, encoding0, encoding1, mask0=mask0, mask1=mask1) + if i == self.conf.n_layers - 1: + continue # no early stopping or adaptive width at last layer + + if do_early_stop: + token0, token1 = self.token_confidence[i](desc0, desc1) + if self.check_if_stop(token0[..., :m, :], token1[..., :n, :], + i, m + n): + break + if do_point_pruning and desc0.shape[-2] > pruning_th: + scores0 = self.log_assignment[i].get_matchability(desc0) + prunemask0 = self.get_pruning_mask(token0, scores0, i) + keep0 = torch.where(prunemask0)[1] + ind0 = ind0.index_select(1, keep0) + desc0 = desc0.index_select(1, keep0) + encoding0 = encoding0.index_select(-2, keep0) + prune0[:, ind0] += 1 + if do_point_pruning and desc1.shape[-2] > pruning_th: + scores1 = self.log_assignment[i].get_matchability(desc1) + prunemask1 = self.get_pruning_mask(token1, scores1, i) + keep1 = torch.where(prunemask1)[1] + ind1 = ind1.index_select(1, keep1) + desc1 = desc1.index_select(1, keep1) + encoding1 = encoding1.index_select(-2, keep1) + prune1[:, ind1] += 1 + + desc0, desc1 = desc0[..., :m, :], desc1[..., :n, :] + scores, _ = self.log_assignment[i](desc0, desc1) + m0, m1, mscores0, mscores1 = filter_matches(scores, + self.conf.filter_threshold) + matches, mscores = [], [] + for k in range(b): + valid = m0[k] > -1 + m_indices_0 = torch.where(valid)[0] + m_indices_1 = m0[k][valid] + if do_point_pruning: + m_indices_0 = ind0[k, m_indices_0] + m_indices_1 = ind1[k, m_indices_1] + matches.append(torch.stack([m_indices_0, m_indices_1], -1)) + mscores.append(mscores0[k][valid]) + + # TODO: Remove when hloc switches to the compact format. + if do_point_pruning: + m0_ = torch.full((b, m), -1, device=m0.device, dtype=m0.dtype) + m1_ = torch.full((b, n), -1, device=m1.device, dtype=m1.dtype) + m0_[:, ind0] = torch.where(m0 == -1, -1, + ind1.gather(1, m0.clamp(min=0))) + m1_[:, ind1] = torch.where(m1 == -1, -1, + ind0.gather(1, m1.clamp(min=0))) + mscores0_ = torch.zeros((b, m), device=mscores0.device) + mscores1_ = torch.zeros((b, n), device=mscores1.device) + mscores0_[:, ind0] = mscores0 + mscores1_[:, ind1] = mscores1 + m0, m1, mscores0, mscores1 = m0_, m1_, mscores0_, mscores1_ + else: + prune0 = torch.ones_like(mscores0) * self.conf.n_layers + prune1 = torch.ones_like(mscores1) * self.conf.n_layers + + pred = { + 'matches0': m0, + 'matches1': m1, + 'matching_scores0': mscores0, + 'matching_scores1': mscores1, + 'stop': i + 1, + 'matches': matches, + 'scores': mscores, + 'prune0': prune0, + 'prune1': prune1, + } + + return pred + + def confidence_threshold(self, layer_index: int) -> float: + """scaled confidence threshold""" + threshold = 0.8 + 0.1 * np.exp(-4.0 * layer_index / self.conf.n_layers) + return np.clip(threshold, 0, 1) + + def get_pruning_mask(self, confidences: torch.Tensor, scores: torch.Tensor, + layer_index: int) -> torch.Tensor: + """mask points which should be removed""" + keep = scores > (1 - self.conf.width_confidence) + if confidences is not None: # Low-confidence points are never pruned. 
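+ # Added gloss (not upstream): combined with the line below, a point is pruned
+ # only when the network is confident about it *and* judges it unmatchable;
+ # points whose confidence is still low are kept for later layers.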
+ keep |= confidences <= self.confidence_thresholds[layer_index] + return keep + + def check_if_stop( + self, + confidences0: torch.Tensor, + confidences1: torch.Tensor, + layer_index: int, + num_points: int, + ) -> torch.Tensor: + """evaluate stopping condition""" + confidences = torch.cat([confidences0, confidences1], -1) + threshold = self.confidence_thresholds[layer_index] + ratio_confident = 1.0 - ( + confidences < threshold).float().sum() / num_points # noqa E501 + return ratio_confident > self.conf.depth_confidence + + def pruning_min_kpts(self, device: torch.device): + if self.conf.flash and FLASH_AVAILABLE and device.type == 'cuda': + return self.pruning_keypoint_thresholds['flash'] + else: + return self.pruning_keypoint_thresholds[device.type] diff --git a/modelscope/models/cv/image_matching_fast/lightglue/sift.py b/modelscope/models/cv/image_matching_fast/lightglue/sift.py new file mode 100644 index 000000000..435d8f7f5 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/sift.py @@ -0,0 +1,221 @@ +import warnings + +import cv2 +import numpy as np +import torch +from kornia.color import rgb_to_grayscale +from packaging import version + +from .utils import Extractor + +try: + import pycolmap +except ImportError: + pycolmap = None + + +def filter_dog_point(points, + scales, + angles, + image_shape, + nms_radius, + scores=None): + h, w = image_shape + ij = np.round(points - 0.5).astype(int).T[::-1] + + # Remove duplicate points (identical coordinates). + # Pick highest scale or score + s = scales if scores is None else scores + buffer = np.zeros((h, w)) + np.maximum.at(buffer, tuple(ij), s) + keep = np.where(buffer[tuple(ij)] == s)[0] + + # Pick lowest angle (arbitrary). + ij = ij[:, keep] + buffer[:] = np.inf + o_abs = np.abs(angles[keep]) + np.minimum.at(buffer, tuple(ij), o_abs) + mask = buffer[tuple(ij)] == o_abs + ij = ij[:, mask] + keep = keep[mask] + + if nms_radius > 0: + # Apply NMS on the remaining points + buffer[:] = 0 + buffer[tuple(ij)] = s[keep] # scores or scale + + local_max = torch.nn.functional.max_pool2d( + torch.from_numpy(buffer).unsqueeze(0), + kernel_size=nms_radius * 2 + 1, + stride=1, + padding=nms_radius, + ).squeeze(0) + is_local_max = buffer == local_max.numpy() + keep = keep[is_local_max[tuple(ij)]] + return keep + + +def sift_to_rootsift(x: torch.Tensor, eps=1e-6) -> torch.Tensor: + x = torch.nn.functional.normalize(x, p=1, dim=-1, eps=eps) + x.clip_(min=eps).sqrt_() + return torch.nn.functional.normalize(x, p=2, dim=-1, eps=eps) + + +def run_opencv_sift(features: cv2.Feature2D, image: np.ndarray) -> np.ndarray: + """ + Detect keypoints using OpenCV Detector. + Optionally, perform description. + Args: + features: OpenCV based keypoints detector and descriptor + image: Grayscale image of uint8 data type + Returns: + keypoints: 1D array of detected cv2.KeyPoint + scores: 1D array of responses + descriptors: 1D array of descriptors + """ + detections, descriptors = features.detectAndCompute(image, None) + points = np.array([k.pt for k in detections], dtype=np.float32) + scores = np.array([k.response for k in detections], dtype=np.float32) + scales = np.array([k.size for k in detections], dtype=np.float32) + angles = np.deg2rad( + np.array([k.angle for k in detections], dtype=np.float32)) + return points, scores, scales, angles, descriptors + + +class SIFT(Extractor): + default_conf = { + 'rootsift': True, + 'nms_radius': 0, # None to disable filtering entirely. 
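+ # Added comment (hedged): filter_dog_point above first drops duplicate
+ # detections at the same pixel (keeping the highest score or scale), then runs
+ # a max-pool NMS with this radius over the surviving points.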
+ 'max_num_keypoints': 4096, + 'backend': + 'opencv', # in {opencv, pycolmap, pycolmap_cpu, pycolmap_cuda} + 'detection_threshold': 0.0066667, # from COLMAP + 'edge_threshold': 10, + 'first_octave': -1, # only used by pycolmap, the default of COLMAP + 'num_octaves': 4, + } + + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, **conf): + super().__init__(**conf) # Update with default configuration. + backend = self.conf.backend + if backend.startswith('pycolmap'): + if pycolmap is None: + raise ImportError( + 'Cannot find module pycolmap: install it with pip' + 'or use backend=opencv.') + options = { + 'peak_threshold': self.conf.detection_threshold, + 'edge_threshold': self.conf.edge_threshold, + 'first_octave': self.conf.first_octave, + 'num_octaves': self.conf.num_octaves, + 'normalization': + pycolmap.Normalization.L2, # L1_ROOT is buggy. + } + device = ('auto' if backend == 'pycolmap' else backend.replace( + 'pycolmap_', '')) + if (backend == 'pycolmap_cpu' or not pycolmap.has_cuda + ) and pycolmap.__version__ < '0.5.0': # noqa E501 + warnings.warn( + 'The pycolmap CPU SIFT is buggy in version < 0.5.0, ' + 'consider upgrading pycolmap or use the CUDA version.', + stacklevel=1, + ) + else: + options['max_num_features'] = self.conf.max_num_keypoints + self.sift = pycolmap.Sift(options=options, device=device) + elif backend == 'opencv': + self.sift = cv2.SIFT_create( + contrastThreshold=self.conf.detection_threshold, + nfeatures=self.conf.max_num_keypoints, + edgeThreshold=self.conf.edge_threshold, + nOctaveLayers=self.conf.num_octaves, + ) + else: + backends = {'opencv', 'pycolmap', 'pycolmap_cpu', 'pycolmap_cuda'} + raise ValueError(f'Unknown backend: {backend} not in ' + f"{{{','.join(backends)}}}.") + + def extract_single_image(self, image: torch.Tensor): + image_np = image.cpu().numpy().squeeze(0) + + if self.conf.backend.startswith('pycolmap'): + if version.parse(pycolmap.__version__) >= version.parse('0.5.0'): + detections, descriptors = self.sift.extract(image_np) + scores = None # Scores are not exposed by COLMAP anymore. + else: + detections, scores, descriptors = self.sift.extract(image_np) + keypoints = detections[:, :2] # Keep only (x, y). + scales, angles = detections[:, -2:].T + if scores is not None and (self.conf.backend == 'pycolmap_cpu' + or not pycolmap.has_cuda): + # Set the scores as a combination of abs. response and scale. + scores = np.abs(scores) * scales + elif self.conf.backend == 'opencv': + # TODO: Check if opencv keypoints are already in corner convention + keypoints, scores, scales, angles, descriptors = run_opencv_sift( + self.sift, (image_np * 255.0).astype(np.uint8)) + pred = { + 'keypoints': keypoints, + 'scales': scales, + 'oris': angles, + 'descriptors': descriptors, + } + if scores is not None: + pred['keypoint_scores'] = scores + + # sometimes pycolmap returns points outside the image. 
We remove them + if self.conf.backend.startswith('pycolmap'): + is_inside = (pred['keypoints'] + 0.5 < np.array( + [image_np.shape[-2:][::-1]])).all(-1) + pred = {k: v[is_inside] for k, v in pred.items()} + + if self.conf.nms_radius is not None: + keep = filter_dog_point( + pred['keypoints'], + pred['scales'], + pred['oris'], + image_np.shape, + self.conf.nms_radius, + scores=pred.get('keypoint_scores'), + ) + pred = {k: v[keep] for k, v in pred.items()} + + pred = {k: torch.from_numpy(v) for k, v in pred.items()} + if scores is not None: + # Keep the k keypoints with highest score + num_points = self.conf.max_num_keypoints + if num_points is not None and len(pred['keypoints']) > num_points: + indices = torch.topk(pred['keypoint_scores'], + num_points).indices + pred = {k: v[indices] for k, v in pred.items()} + + return pred + + def forward(self, data: dict) -> dict: + image = data['image'] + if image.shape[1] == 3: + image = rgb_to_grayscale(image) + device = image.device + image = image.cpu() + pred = [] + for k in range(len(image)): + img = image[k] + if 'image_size' in data.keys(): + # avoid extracting points in padded areas + w, h = data['image_size'][k] + img = img[:, :h, :w] + p = self.extract_single_image(img) + pred.append(p) + pred = { + k: torch.stack([p[k] for p in pred], 0).to(device) + for k in pred[0] + } + if self.conf.rootsift: + pred['descriptors'] = sift_to_rootsift(pred['descriptors']) + return pred diff --git a/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py new file mode 100644 index 000000000..0f628458f --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/superpoint.py @@ -0,0 +1,223 @@ +# %BANNER_BEGIN% +# --------------------------------------------------------------------- +# %COPYRIGHT_BEGIN% +# +# Magic Leap, Inc. ("COMPANY") CONFIDENTIAL +# +# Unpublished Copyright (c) 2020 +# Magic Leap, Inc., All Rights Reserved. +# +# NOTICE: All information contained herein is, and remains the property +# of COMPANY. The intellectual and technical concepts contained herein +# are proprietary to COMPANY and may be covered by U.S. and Foreign +# Patents, patents in process, and are protected by trade secret or +# copyright law. Dissemination of this information or reproduction of +# this material is strictly forbidden unless prior written permission is +# obtained from COMPANY. Access to the source code contained herein is +# hereby forbidden to anyone except current COMPANY employees, managers +# or contractors who have executed Confidentiality and Non-disclosure +# agreements explicitly covering such access. +# +# The copyright notice above does not evidence any actual or intended +# publication or disclosure of this source code, which includes +# information that is confidential and/or proprietary, and is a trade +# secret, of COMPANY. ANY REPRODUCTION, MODIFICATION, DISTRIBUTION, +# PUBLIC PERFORMANCE, OR PUBLIC DISPLAY OF OR THROUGH USE OF THIS +# SOURCE CODE WITHOUT THE EXPRESS WRITTEN CONSENT OF COMPANY IS +# STRICTLY PROHIBITED, AND IN VIOLATION OF APPLICABLE LAWS AND +# INTERNATIONAL TREATIES. THE RECEIPT OR POSSESSION OF THIS SOURCE +# CODE AND/OR RELATED INFORMATION DOES NOT CONVEY OR IMPLY ANY RIGHTS +# TO REPRODUCE, DISCLOSE OR DISTRIBUTE ITS CONTENTS, OR TO MANUFACTURE, +# USE, OR SELL ANYTHING THAT IT MAY DESCRIBE, IN WHOLE OR IN PART. 
+# +# %COPYRIGHT_END% +# ---------------------------------------------------------------------- +# %AUTHORS_BEGIN% +# +# Originating Authors: Paul-Edouard Sarlin +# +# %AUTHORS_END% +# --------------------------------------------------------------------*/ +# %BANNER_END% + +# Adapted by Remi Pautrat, Philipp Lindenberger + +import os.path as osp + +import torch +from kornia.color import rgb_to_grayscale +from torch import nn + +from .utils import Extractor + + +def simple_nms(scores, nms_radius: int): + """Fast Non-maximum suppression to remove nearby points""" + assert nms_radius >= 0 + + def max_pool(x): + return torch.nn.functional.max_pool2d( + x, kernel_size=nms_radius * 2 + 1, stride=1, padding=nms_radius) + + zeros = torch.zeros_like(scores) + max_mask = scores == max_pool(scores) + for _ in range(2): + supp_mask = max_pool(max_mask.float()) > 0 + supp_scores = torch.where(supp_mask, zeros, scores) + new_max_mask = supp_scores == max_pool(supp_scores) + max_mask = max_mask | (new_max_mask & (~supp_mask)) + return torch.where(max_mask, scores, zeros) + + +def top_k_keypoints(keypoints, scores, k): + if k >= len(keypoints): + return keypoints, scores + scores, indices = torch.topk(scores, k, dim=0, sorted=True) + return keypoints[indices], scores + + +def sample_descriptors(keypoints, descriptors, s: int = 8): + """Interpolate descriptors at keypoint locations""" + b, c, h, w = descriptors.shape + keypoints = keypoints - s / 2 + 0.5 + keypoints /= torch.tensor([(w * s - s / 2 - 0.5), + (h * s - s / 2 - 0.5)], ).to(keypoints)[None] + keypoints = keypoints * 2 - 1 # normalize to (-1, 1) + args = {'align_corners': True} if torch.__version__ >= '1.3' else {} + descriptors = torch.nn.functional.grid_sample( + descriptors, keypoints.view(b, 1, -1, 2), mode='bilinear', **args) + descriptors = torch.nn.functional.normalize( + descriptors.reshape(b, c, -1), p=2, dim=1) + return descriptors + + +class SuperPoint(Extractor): + """SuperPoint Convolutional Detector and Descriptor + + SuperPoint: Self-Supervised Interest Point Detection and + Description. Daniel DeTone, Tomasz Malisiewicz, and Andrew + Rabinovich. In CVPRW, 2019. https://arxiv.org/abs/1712.07629 + + """ + + default_conf = { + 'descriptor_dim': 256, + 'nms_radius': 4, + 'max_num_keypoints': None, + 'detection_threshold': 0.0005, + 'remove_borders': 4, + } + + preprocess_conf = { + 'resize': 1024, + } + + required_data_keys = ['image'] + + def __init__(self, model_dir, **conf): + super().__init__(**conf) # Update with default configuration. 
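+ # Added note (not in the original file): the detector head built below predicts
+ # 65 channels per 8x8 cell -- 64 sub-pixel positions plus one "dustbin"
+ # (no-keypoint) channel that is discarded after the softmax in forward().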
+ self.relu = nn.ReLU(inplace=True) + self.pool = nn.MaxPool2d(kernel_size=2, stride=2) + c1, c2, c3, c4, c5 = 64, 64, 128, 128, 256 + + self.conv1a = nn.Conv2d(1, c1, kernel_size=3, stride=1, padding=1) + self.conv1b = nn.Conv2d(c1, c1, kernel_size=3, stride=1, padding=1) + self.conv2a = nn.Conv2d(c1, c2, kernel_size=3, stride=1, padding=1) + self.conv2b = nn.Conv2d(c2, c2, kernel_size=3, stride=1, padding=1) + self.conv3a = nn.Conv2d(c2, c3, kernel_size=3, stride=1, padding=1) + self.conv3b = nn.Conv2d(c3, c3, kernel_size=3, stride=1, padding=1) + self.conv4a = nn.Conv2d(c3, c4, kernel_size=3, stride=1, padding=1) + self.conv4b = nn.Conv2d(c4, c4, kernel_size=3, stride=1, padding=1) + + self.convPa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convPb = nn.Conv2d(c5, 65, kernel_size=1, stride=1, padding=0) + + self.convDa = nn.Conv2d(c4, c5, kernel_size=3, stride=1, padding=1) + self.convDb = nn.Conv2d( + c5, self.conf.descriptor_dim, kernel_size=1, stride=1, padding=0) + + weights_path = osp.join(model_dir, 'superpoint_v1.pth') + self.load_state_dict(torch.load(weights_path, map_location='cpu')) + + if self.conf.max_num_keypoints is not None and self.conf.max_num_keypoints <= 0: + raise ValueError('max_num_keypoints must be positive or None') + + def forward(self, data: dict) -> dict: + """Compute keypoints, scores, descriptors for image""" + for key in self.required_data_keys: + assert key in data, f'Missing key {key} in data' + image = data['image'] + if image.shape[1] == 3: + image = rgb_to_grayscale(image) + + # Shared Encoder + x = self.relu(self.conv1a(image)) + x = self.relu(self.conv1b(x)) + x = self.pool(x) + x = self.relu(self.conv2a(x)) + x = self.relu(self.conv2b(x)) + x = self.pool(x) + x = self.relu(self.conv3a(x)) + x = self.relu(self.conv3b(x)) + x = self.pool(x) + x = self.relu(self.conv4a(x)) + x = self.relu(self.conv4b(x)) + + # Compute the dense keypoint scores + cPa = self.relu(self.convPa(x)) + scores = self.convPb(cPa) + scores = torch.nn.functional.softmax(scores, 1)[:, :-1] + b, _, h, w = scores.shape + scores = scores.permute(0, 2, 3, 1).reshape(b, h, w, 8, 8) + scores = scores.permute(0, 1, 3, 2, 4).reshape(b, h * 8, w * 8) + scores = simple_nms(scores, self.conf.nms_radius) + + # Discard keypoints near the image borders + if self.conf.remove_borders: + pad = self.conf.remove_borders + scores[:, :pad] = -1 + scores[:, :, :pad] = -1 + scores[:, -pad:] = -1 + scores[:, :, -pad:] = -1 + + # Extract keypoints + best_kp = torch.where(scores > self.conf.detection_threshold) + scores = scores[best_kp] + + # Separate into batches + keypoints = [ + torch.stack(best_kp[1:3], dim=-1)[best_kp[0] == i] + for i in range(b) + ] + scores = [scores[best_kp[0] == i] for i in range(b)] + + # Keep the k keypoints with highest score + if self.conf.max_num_keypoints is not None: + keypoints, scores = list( + zip(*[ + top_k_keypoints(k, s, self.conf.max_num_keypoints) + for k, s in zip(keypoints, scores) + ])) + + # Convert (h, w) to (x, y) + keypoints = [torch.flip(k, [1]).float() for k in keypoints] + + # Compute the dense descriptors + cDa = self.relu(self.convDa(x)) + descriptors = self.convDb(cDa) + descriptors = torch.nn.functional.normalize(descriptors, p=2, dim=1) + + # Extract descriptors + descriptors = [ + sample_descriptors(k[None], d[None], 8)[0] + for k, d in zip(keypoints, descriptors) + ] + + return { + 'keypoints': + torch.stack(keypoints, 0), + 'keypoint_scores': + torch.stack(scores, 0), + 'descriptors': + torch.stack(descriptors, 0).transpose(-1, 
-2).contiguous(), + } diff --git a/modelscope/models/cv/image_matching_fast/lightglue/utils.py b/modelscope/models/cv/image_matching_fast/lightglue/utils.py new file mode 100644 index 000000000..86621e170 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/utils.py @@ -0,0 +1,172 @@ +import collections.abc as collections +from pathlib import Path +from types import SimpleNamespace +from typing import Callable, List, Optional, Tuple, Union + +import cv2 +import kornia +import numpy as np +import torch + + +class ImagePreprocessor: + default_conf = { + 'resize': None, # target edge length, None for no resizing + 'side': 'long', + 'interpolation': 'bilinear', + 'align_corners': None, + 'antialias': True, + } + + def __init__(self, **conf) -> None: + super().__init__() + self.conf = {**self.default_conf, **conf} + self.conf = SimpleNamespace(**self.conf) + + def __call__(self, img: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Resize and preprocess an image, return image and resize scale""" + h, w = img.shape[-2:] + if self.conf.resize is not None: + img = kornia.geometry.transform.resize( + img, + self.conf.resize, + side=self.conf.side, + antialias=self.conf.antialias, + align_corners=self.conf.align_corners, + ) + scale = torch.Tensor([img.shape[-1] / w, img.shape[-2] / h]).to(img) + return img, scale + + +def map_tensor(input_, func: Callable): + string_classes = (str, bytes) + if isinstance(input_, string_classes): + return input_ + elif isinstance(input_, collections.Mapping): + return {k: map_tensor(sample, func) for k, sample in input_.items()} + elif isinstance(input_, collections.Sequence): + return [map_tensor(sample, func) for sample in input_] + elif isinstance(input_, torch.Tensor): + return func(input_) + else: + return input_ + + +def batch_to_device(batch: dict, + device: str = 'cpu', + non_blocking: bool = True): + """Move batch (dict) to device""" + + def _func(tensor): + return tensor.to(device=device, non_blocking=non_blocking).detach() + + return map_tensor(batch, _func) + + +def rbd(data: dict) -> dict: + """Remove batch dimension from elements in data""" + return { + k: v[0] if isinstance(v, (torch.Tensor, np.ndarray, list)) else v + for k, v in data.items() + } + + +def read_image(path: Path, grayscale: bool = False) -> np.ndarray: + """Read an image from path as RGB or grayscale""" + if not Path(path).exists(): + raise FileNotFoundError(f'No image at path {path}.') + mode = cv2.IMREAD_GRAYSCALE if grayscale else cv2.IMREAD_COLOR + image = cv2.imread(str(path), mode) + if image is None: + raise IOError(f'Could not read image at {path}.') + if not grayscale: + image = image[..., ::-1] + return image + + +def numpy_image_to_torch(image: np.ndarray) -> torch.Tensor: + """Normalize the image tensor and reorder the dimensions.""" + if image.ndim == 3: + image = image.transpose((2, 0, 1)) # HxWxC to CxHxW + elif image.ndim == 2: + image = image[None] # add channel axis + else: + raise ValueError(f'Not an image: {image.shape}') + return torch.tensor(image / 255.0, dtype=torch.float) + + +def resize_image( + image: np.ndarray, + size: Union[List[int], int], + fn: str = 'max', + interp: Optional[str] = 'area', +) -> np.ndarray: + """Resize an image to a fixed size, or according to max or min edge.""" + h, w = image.shape[:2] + + fn = {'max': max, 'min': min}[fn] + if isinstance(size, int): + scale = size / fn(h, w) + h_new, w_new = int(round(h * scale)), int(round(w * scale)) + scale = (w_new / w, h_new / h) + elif isinstance(size, (tuple, list)): 
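+ # Added comment (hedged): an int `size` rescales so that the max/min edge equals
+ # `size`; a (height, width) pair is used verbatim, e.g. size=(480, 640) maps any
+ # image to 480x640 irrespective of aspect ratio (illustrative values).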
+ h_new, w_new = size + scale = (w_new / w, h_new / h) + else: + raise ValueError(f'Incorrect new size: {size}') + mode = { + 'linear': cv2.INTER_LINEAR, + 'cubic': cv2.INTER_CUBIC, + 'nearest': cv2.INTER_NEAREST, + 'area': cv2.INTER_AREA, + }[interp] + return cv2.resize(image, (w_new, h_new), interpolation=mode), scale + + +def load_image(path: Path, resize: int = None, **kwargs) -> torch.Tensor: + image = read_image(path) + if resize is not None: + image, _ = resize_image(image, resize, **kwargs) + return numpy_image_to_torch(image) + + +class Extractor(torch.nn.Module): + + def __init__(self, **conf): + super().__init__() + self.conf = SimpleNamespace(**{**self.default_conf, **conf}) + + @torch.no_grad() + def extract(self, img: torch.Tensor, **conf) -> dict: + """Perform extraction with online resizing""" + if img.dim() == 3: + img = img[None] # add batch dim + assert img.dim() == 4 and img.shape[0] == 1 + shape = img.shape[-2:][::-1] + img, scales = ImagePreprocessor(**{ + **self.preprocess_conf, + **conf + })( + img) + feats = self.forward({'image': img}) + feats['image_size'] = torch.tensor(shape)[None].to(img).float() + feats['keypoints'] = (feats['keypoints'] + 0.5) / scales[None] - 0.5 + return feats + + +def match_pair( + extractor, + matcher, + image0: torch.Tensor, + image1: torch.Tensor, + device: str = 'cpu', + **preprocess, +): + """Match a pair of images (image0, image1) with an extractor and matcher""" + feats0 = extractor.extract(image0, **preprocess) + feats1 = extractor.extract(image1, **preprocess) + matches01 = matcher({'image0': feats0, 'image1': feats1}) + data = [feats0, feats1, matches01] + # remove batch dim and move to target device + feats0, feats1, matches01 = [batch_to_device(rbd(x), device) for x in data] + return feats0, feats1, matches01 diff --git a/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py new file mode 100644 index 000000000..13ea8a589 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue/viz2d.py @@ -0,0 +1,199 @@ +""" +2D visualization primitives based on Matplotlib. +1) Plot images with `plot_images`. +2) Call `plot_keypoints` or `plot_matches` any number of times. +3) Optionally: save a .png or .pdf plot (nice in papers!) with `save_plot`. +""" + +import matplotlib +import matplotlib.patheffects as path_effects +import matplotlib.pyplot as plt +import numpy as np +import torch + + +def cm_RdGn(x): + """Custom colormap: red (0) -> yellow (0.5) -> green (1).""" + x = np.clip(x, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0]]) + (2 - x) * np.array([[1.0, 0, 0]]) + return np.clip(c, 0, 1) + + +def cm_BlRdGn(x_): + """Custom colormap: blue (-1) -> red (0.0) -> green (1).""" + x = np.clip(x_, 0, 1)[..., None] * 2 + c = x * np.array([[0, 1.0, 0, 1.0]]) + (2 - x) * np.array( + [[1.0, 0, 0, 1.0]]) + + xn = -np.clip(x_, -1, 0)[..., None] * 2 + cn = xn * np.array([[0, 0.1, 1, 1.0]]) + (2 - xn) * np.array( + [[1.0, 0, 0, 1.0]]) + out = np.clip(np.where(x_[..., None] < 0, cn, c), 0, 1) + return out + + +def cm_prune(x_): + """Custom colormap to visualize pruning""" + if isinstance(x_, torch.Tensor): + x_ = x_.cpu().numpy() + max_i = max(x_) + norm_x = np.where(x_ == max_i, -1, (x_ - 1) / 9) + return cm_BlRdGn(norm_x) + + +def plot_images(imgs, + titles=None, + cmaps='gray', + dpi=100, + pad=0.5, + adaptive=True): + """Plot a set of images horizontally. + Args: + imgs: list of NumPy RGB (H, W, 3) or PyTorch RGB (3, H, W) or mono (H, W). 
+ titles: a list of strings, as titles for each image. + cmaps: colormaps for monochrome images. + adaptive: whether the figure size should fit the image aspect ratios. + """ + # conversion to (H, W, 3) for torch.Tensor + imgs = [ + img.permute(1, 2, 0).cpu().numpy() if + (isinstance(img, torch.Tensor) and img.dim() == 3) else img + for img in imgs + ] + + n = len(imgs) + if not isinstance(cmaps, (list, tuple)): + cmaps = [cmaps] * n + + if adaptive: + ratios = [i.shape[1] / i.shape[0] for i in imgs] # W / H + else: + ratios = [4 / 3] * n + figsize = [sum(ratios) * 4.5, 4.5] + fig, ax = plt.subplots( + 1, n, figsize=figsize, dpi=dpi, gridspec_kw={'width_ratios': ratios}) + if n == 1: + ax = [ax] + for i in range(n): + ax[i].imshow(imgs[i], cmap=plt.get_cmap(cmaps[i])) + ax[i].get_yaxis().set_ticks([]) + ax[i].get_xaxis().set_ticks([]) + ax[i].set_axis_off() + for spine in ax[i].spines.values(): # remove frame + spine.set_visible(False) + if titles: + ax[i].set_title(titles[i]) + fig.tight_layout(pad=pad) + + +def plot_keypoints(kpts, colors='lime', ps=4, axes=None, a=1.0): + """Plot keypoints for existing images. + Args: + kpts: list of ndarrays of size (N, 2). + colors: string, or list of list of tuples (one for each keypoints). + ps: size of the keypoints as float. + """ + if not isinstance(colors, list): + colors = [colors] * len(kpts) + if not isinstance(a, list): + a = [a] * len(kpts) + if axes is None: + axes = plt.gcf().axes + for ax, k, c, alpha in zip(axes, kpts, colors, a): + if isinstance(k, torch.Tensor): + k = k.cpu().numpy() + ax.scatter(k[:, 0], k[:, 1], c=c, s=ps, linewidths=0, alpha=alpha) + + +def plot_matches(kpts0, + kpts1, + color=None, + lw=1.5, + ps=4, + a=1.0, + labels=None, + axes=None): + """Plot matches for a pair of existing images. + Args: + kpts0, kpts1: corresponding keypoints of size (N, 2). + color: color of each match, string or RGB tuple. Random if not given. + lw: width of the lines. + ps: size of the end points (no endpoint if ps=0) + indices: indices of the images to draw the matches on. + a: alpha opacity of the match lines. 
+ """ + fig = plt.gcf() + if axes is None: + ax = fig.axes + ax0, ax1 = ax[0], ax[1] + else: + ax0, ax1 = axes + if isinstance(kpts0, torch.Tensor): + kpts0 = kpts0.cpu().numpy() + if isinstance(kpts1, torch.Tensor): + kpts1 = kpts1.cpu().numpy() + assert len(kpts0) == len(kpts1) + if color is None: + color = matplotlib.cm.hsv(np.random.rand(len(kpts0))).tolist() + elif len(color) > 0 and not isinstance(color[0], (tuple, list)): + color = [color] * len(kpts0) + + if lw > 0: + for i in range(len(kpts0)): + line = matplotlib.patches.ConnectionPatch( + xyA=(kpts0[i, 0], kpts0[i, 1]), + xyB=(kpts1[i, 0], kpts1[i, 1]), + coordsA=ax0.transData, + coordsB=ax1.transData, + axesA=ax0, + axesB=ax1, + zorder=1, + color=color[i], + linewidth=lw, + clip_on=True, + alpha=a, + label=None if labels is None else labels[i], + picker=5.0, + ) + line.set_annotation_clip(True) + fig.add_artist(line) + + # freeze the axes to prevent the transform to change + ax0.autoscale(enable=False) + ax1.autoscale(enable=False) + + if ps > 0: + ax0.scatter(kpts0[:, 0], kpts0[:, 1], c=color, s=ps) + ax1.scatter(kpts1[:, 0], kpts1[:, 1], c=color, s=ps) + + +def add_text( + idx, + text, + pos=(0.01, 0.99), + fs=15, + color='w', + lcolor='k', + lwidth=2, + ha='left', + va='top', +): + ax = plt.gcf().axes[idx] + t = ax.text( + *pos, + text, + fontsize=fs, + ha=ha, + va=va, + color=color, + transform=ax.transAxes) + if lcolor is not None: + t.set_path_effects([ + path_effects.Stroke(linewidth=lwidth, foreground=lcolor), + path_effects.Normal(), + ]) + + +def save_plot(path, **kw): + """Save the current figure without any white margin.""" + plt.savefig(path, bbox_inches='tight', pad_inches=0, **kw) diff --git a/modelscope/models/cv/image_matching_fast/lightglue_model.py b/modelscope/models/cv/image_matching_fast/lightglue_model.py new file mode 100644 index 000000000..8043051c2 --- /dev/null +++ b/modelscope/models/cv/image_matching_fast/lightglue_model.py @@ -0,0 +1,98 @@ +# The implementation is made publicly available under the +# Apache 2.0 license at https://github.com/cvg/LightGlue + +import os.path as osp +from pathlib import Path + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks +from .config.default import lightglue_default_conf +from .lightglue import ALIKED, DISK, SIFT, LightGlue, SuperPoint +from .lightglue.utils import numpy_image_to_torch, rbd + + +@MODELS.register_module( + Tasks.image_matching, module_name=Models.lightglue_image_matching) +class LightGlueImageMatching(TorchModel): + ''' + LightGlue is an simple but effective enhancement of the state-of-the-art image matching method, SuperGlue. 
+ For more details, please refer to https://arxiv.org/abs/2306.13643 + ''' + + def __init__(self, model_dir: str, max_num_keypoints=2048, **kwargs): + + super().__init__(model_dir, **kwargs) + + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') # 'mps', 'cpu' + + features = lightglue_default_conf.get('features', 'superpoint') + + if features == 'disk': + self.extractor = DISK( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + elif features == 'aliked': + self.extractor = ALIKED( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + elif features == 'sift': + self.extractor = SIFT( + max_num_keypoints=max_num_keypoints).eval().to(self.device) + else: + self.extractor = SuperPoint( + model_dir=model_dir, + max_num_keypoints=max_num_keypoints).eval().to(self.device) + + self.matcher = LightGlue( + model_dir=model_dir, + default_conf=lightglue_default_conf).eval().to(self.device) + + def forward(self, inputs): + ''' + Args: + inputs: a dict with keys 'image0', 'image1' + ''' + + feats0 = self.extractor.extract( + numpy_image_to_torch(inputs['image0']).to(self.device)) + feats1 = self.extractor.extract( + numpy_image_to_torch(inputs['image1']).to(self.device)) + matches01 = self.matcher({'image0': feats0, 'image1': feats1}) + + return [feats0, feats1, matches01] + + def postprocess(self, inputs): + ''' + Args: + inputs: a list of feats0, feats1, matches01 + ''' + matching_result = inputs + feats0, feats1, matches01 = [rbd(x) for x in matching_result + ] # remove batch dimension + + kpts0, kpts1, matches = feats0['keypoints'], feats1[ + 'keypoints'], matches01['matches'] + m_kpts0, m_kpts1 = kpts0[matches[..., 0]], kpts1[matches[..., 1]] + + # match confidence + confidence = matches01['scores'] + + matches_result = { + 'kpts0': m_kpts0, + 'kpts1': m_kpts1, + 'confidence': confidence + } + + results = {OutputKeys.MATCHES: matches_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py new file mode 100644 index 000000000..691834510 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .geomvsnet_model import GeoMVSNetDepthEstimation + +else: + _import_structure = { + 'geomvsnet_model': ['GeoMVSNetDepthEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py new file mode 100644 index 000000000..37d92c13a --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/colmap2mvsnet.py @@ -0,0 +1,472 @@ +# The implementation is borrowed from https://github.com/YoYo000/MVSNet. Model reading is provided by COLMAP. 
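+# Added orientation comment (hedged, not from the original script): the helpers
+# below parse a COLMAP sparse model (cameras.{txt,bin}, images.{txt,bin},
+# points3D.{txt,bin}) into the Camera/Image/Point3D namedtuples defined here,
+# e.g. (hypothetical paths):
+#     cameras = read_cameras_binary('sparse/0/cameras.bin')
+#     images = read_images_binary('sparse/0/images.bin')
+#     fx = cameras[images[1].camera_id].params[0]  # first intrinsic parameter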
+ +from __future__ import print_function +import collections +import multiprocessing as mp +import os +import shutil +import struct +from functools import partial + +import cv2 +import numpy as np + +# ============================ read_model.py ============================# +CameraModel = collections.namedtuple('CameraModel', + ['model_id', 'model_name', 'num_params']) +Camera = collections.namedtuple('Camera', + ['id', 'model', 'width', 'height', 'params']) +BaseImage = collections.namedtuple( + 'Image', ['id', 'qvec', 'tvec', 'camera_id', 'name', 'xys', 'point3D_ids']) +Point3D = collections.namedtuple( + 'Point3D', ['id', 'xyz', 'rgb', 'error', 'image_ids', 'point2D_idxs']) + + +class Image(BaseImage): + + def qvec2rotmat(self): + return qvec2rotmat(self.qvec) + + +CAMERA_MODELS = { + CameraModel(model_id=0, model_name='SIMPLE_PINHOLE', num_params=3), + CameraModel(model_id=1, model_name='PINHOLE', num_params=4), + CameraModel(model_id=2, model_name='SIMPLE_RADIAL', num_params=4), + CameraModel(model_id=3, model_name='RADIAL', num_params=5), + CameraModel(model_id=4, model_name='OPENCV', num_params=8), + CameraModel(model_id=5, model_name='OPENCV_FISHEYE', num_params=8), + CameraModel(model_id=6, model_name='FULL_OPENCV', num_params=12), + CameraModel(model_id=7, model_name='FOV', num_params=5), + CameraModel(model_id=8, model_name='SIMPLE_RADIAL_FISHEYE', num_params=4), + CameraModel(model_id=9, model_name='RADIAL_FISHEYE', num_params=5), + CameraModel(model_id=10, model_name='THIN_PRISM_FISHEYE', num_params=12) +} +CAMERA_MODEL_IDS = dict([(camera_model.model_id, camera_model) + for camera_model in CAMERA_MODELS]) + + +def read_next_bytes(fid, + num_bytes, + format_char_sequence, + endian_character='<'): + """Read and unpack the next bytes from a binary file. + :param fid: + :param num_bytes: Sum of combination of {2, 4, 8}, e.g. 2, 6, 16, 30, etc. + :param format_char_sequence: List of {c, e, f, d, h, H, i, I, l, L, q, Q}. + :param endian_character: Any of {@, =, <, >, !} + :return: Tuple of read and unpacked values. 
+ """ + data = fid.read(num_bytes) + return struct.unpack(endian_character + format_char_sequence, data) + + +def read_cameras_text(path): + cameras = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + camera_id = int(elems[0]) + model = elems[1] + width = int(elems[2]) + height = int(elems[3]) + params = np.array(tuple(map(float, elems[4:]))) + cameras[camera_id] = Camera( + id=camera_id, + model=model, + width=width, + height=height, + params=params) + return cameras + + +def read_cameras_binary(path_to_model_file): + cameras = {} + with open(path_to_model_file, 'rb') as fid: + num_cameras = read_next_bytes(fid, 8, 'Q')[0] + for camera_line_index in range(num_cameras): + camera_properties = read_next_bytes( + fid, num_bytes=24, format_char_sequence='iiQQ') + camera_id = camera_properties[0] + model_id = camera_properties[1] + model_name = CAMERA_MODEL_IDS[camera_properties[1]].model_name + width = camera_properties[2] + height = camera_properties[3] + num_params = CAMERA_MODEL_IDS[model_id].num_params + params = read_next_bytes( + fid, + num_bytes=8 * num_params, + format_char_sequence='d' * num_params) + cameras[camera_id] = Camera( + id=camera_id, + model=model_name, + width=width, + height=height, + params=np.array(params)) + assert len(cameras) == num_cameras + return cameras + + +def read_images_text(path): + images = {} + with open(path, 'r', encoding='utf-8') as fid: + while True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + image_id = int(elems[0]) + qvec = np.array(tuple(map(float, elems[1:5]))) + tvec = np.array(tuple(map(float, elems[5:8]))) + camera_id = int(elems[8]) + image_name = elems[9] + elems = fid.readline().split() + xys = np.column_stack([ + tuple(map(float, elems[0::3])), + tuple(map(float, elems[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, elems[2::3]))) + images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_images_binary(path_to_model_file): + images = {} + with open(path_to_model_file, 'rb') as fid: + num_reg_images = read_next_bytes(fid, 8, 'Q')[0] + for image_index in range(num_reg_images): + binary_image_properties = read_next_bytes( + fid, num_bytes=64, format_char_sequence='idddddddi') + image_id = binary_image_properties[0] + qvec = np.array(binary_image_properties[1:5]) + tvec = np.array(binary_image_properties[5:8]) + camera_id = binary_image_properties[8] + image_name = '' + current_char = read_next_bytes(fid, 1, 'c')[0] + while current_char != b'\x00': # look for the ASCII 0 entry + image_name += current_char.decode('utf-8') + current_char = read_next_bytes(fid, 1, 'c')[0] + num_points2D = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + x_y_id_s = read_next_bytes( + fid, + num_bytes=24 * num_points2D, + format_char_sequence='ddq' * num_points2D) + xys = np.column_stack([ + tuple(map(float, x_y_id_s[0::3])), + tuple(map(float, x_y_id_s[1::3])) + ]) + point3D_ids = np.array(tuple(map(int, x_y_id_s[2::3]))) + images[image_id] = Image( + id=image_id, + qvec=qvec, + tvec=tvec, + camera_id=camera_id, + name=image_name, + xys=xys, + point3D_ids=point3D_ids) + return images + + +def read_points3D_text(path): + points3D = {} + with open(path, 'r', encoding='utf-8') as fid: + while 
True: + line = fid.readline() + if not line: + break + line = line.strip() + if len(line) > 0 and line[0] != '#': + elems = line.split() + point3D_id = int(elems[0]) + xyz = np.array(tuple(map(float, elems[1:4]))) + rgb = np.array(tuple(map(int, elems[4:7]))) + error = float(elems[7]) + image_ids = np.array(tuple(map(int, elems[8::2]))) + point2D_idxs = np.array(tuple(map(int, elems[9::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_points3d_binary(path_to_model_file): + points3D = {} + with open(path_to_model_file, 'rb') as fid: + num_points = read_next_bytes(fid, 8, 'Q')[0] + for point_line_index in range(num_points): + binary_point_line_properties = read_next_bytes( + fid, num_bytes=43, format_char_sequence='QdddBBBd') + point3D_id = binary_point_line_properties[0] + xyz = np.array(binary_point_line_properties[1:4]) + rgb = np.array(binary_point_line_properties[4:7]) + error = np.array(binary_point_line_properties[7]) + track_length = read_next_bytes( + fid, num_bytes=8, format_char_sequence='Q')[0] + track_elems = read_next_bytes( + fid, + num_bytes=8 * track_length, + format_char_sequence='ii' * track_length) + image_ids = np.array(tuple(map(int, track_elems[0::2]))) + point2D_idxs = np.array(tuple(map(int, track_elems[1::2]))) + points3D[point3D_id] = Point3D( + id=point3D_id, + xyz=xyz, + rgb=rgb, + error=error, + image_ids=image_ids, + point2D_idxs=point2D_idxs) + return points3D + + +def read_model(path, ext): + if ext == '.txt': + cameras = read_cameras_text(os.path.join(path, 'cameras' + ext)) + images = read_images_text(os.path.join(path, 'images' + ext)) + points3D = read_points3D_text(os.path.join(path, 'points3D') + ext) + else: + cameras = read_cameras_binary(os.path.join(path, 'cameras' + ext)) + images = read_images_binary(os.path.join(path, 'images' + ext)) + points3D = read_points3d_binary(os.path.join(path, 'points3D') + ext) + return cameras, images, points3D + + +def qvec2rotmat(qvec): + return np.array([ + [ + 1 - 2 * qvec[2]**2 - 2 * qvec[3]**2, + 2 * qvec[1] * qvec[2] - 2 * qvec[0] * qvec[3], + 2 * qvec[3] * qvec[1] + 2 * qvec[0] * qvec[2] + ], # noqa + [ + 2 * qvec[1] * qvec[2] + 2 * qvec[0] * qvec[3], + 1 - 2 * qvec[1]**2 - 2 * qvec[3]**2, + 2 * qvec[2] * qvec[3] - 2 * qvec[0] * qvec[1] + ], # noqa + [ + 2 * qvec[3] * qvec[1] - 2 * qvec[0] * qvec[2], + 2 * qvec[2] * qvec[3] + 2 * qvec[0] * qvec[1], + 1 - 2 * qvec[1]**2 - 2 * qvec[2]**2 + ] + ]) # noqa + + +def rotmat2qvec(R): + Rxx, Ryx, Rzx, Rxy, Ryy, Rzy, Rxz, Ryz, Rzz = R.flat + K = np.array( + [[Rxx - Ryy - Rzz, 0, 0, 0], [Ryx + Rxy, Ryy - Rxx - Rzz, 0, 0], + [Rzx + Rxz, Rzy + Ryz, Rzz - Rxx - Ryy, 0], + [Ryz - Rzy, Rzx - Rxz, Rxy - Ryx, Rxx + Ryy + Rzz]]) / 3.0 # noqa + eigvals, eigvecs = np.linalg.eigh(K) + qvec = eigvecs[[3, 0, 1, 2], np.argmax(eigvals)] + if qvec[0] < 0: + qvec *= -1 + return qvec + + +def calc_score(inputs, images, points3d, extrinsic, args): + i, j = inputs + id_i = images[i + 1].point3D_ids + id_j = images[j + 1].point3D_ids + id_intersect = [it for it in id_i if it in id_j] + cam_center_i = -np.matmul(extrinsic[i + 1][:3, :3].transpose(), + extrinsic[i + 1][:3, 3:4])[:, 0] + cam_center_j = -np.matmul(extrinsic[j + 1][:3, :3].transpose(), + extrinsic[j + 1][:3, 3:4])[:, 0] + score = 0 + for pid in id_intersect: + if pid == -1: + continue + p = points3d[pid].xyz + theta = (180 / np.pi) * np.arccos( + np.dot(cam_center_i - p, cam_center_j - p) + / 
np.linalg.norm(cam_center_i - p) + / np.linalg.norm(cam_center_j - p)) + tmp_value = ( + 2 * # noqa + (args.sigma1 if theta <= args.theta0 else args.sigma2)**2) + score += np.exp(-(theta - args.theta0) * # noqa + (theta - args.theta0) / tmp_value) + return i, j, score + + +def processing_single_scene(args): + + image_dir = os.path.join(args.dense_folder, 'images') + model_dir = os.path.join(args.dense_folder, 'sparse') + cam_dir = os.path.join(args.save_folder, 'cams') + image_converted_dir = os.path.join(args.save_folder, 'images_post') + + if os.path.exists(image_converted_dir): + shutil.rmtree(image_converted_dir) + os.makedirs(image_converted_dir) + if os.path.exists(cam_dir): + shutil.rmtree(cam_dir) + + cameras, images, points3d = read_model(model_dir, args.model_ext) + num_images = len(list(images.items())) + + param_type = { + 'SIMPLE_PINHOLE': ['f', 'cx', 'cy'], + 'PINHOLE': ['fx', 'fy', 'cx', 'cy'], + 'SIMPLE_RADIAL': ['f', 'cx', 'cy', 'k'], + 'SIMPLE_RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k'], + 'RADIAL': ['f', 'cx', 'cy', 'k1', 'k2'], + 'RADIAL_FISHEYE': ['f', 'cx', 'cy', 'k1', 'k2'], + 'OPENCV': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2'], + 'OPENCV_FISHEYE': ['fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'k3', 'k4'], + 'FULL_OPENCV': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'k5', + 'k6' + ], + 'FOV': ['fx', 'fy', 'cx', 'cy', 'omega'], + 'THIN_PRISM_FISHEYE': [ + 'fx', 'fy', 'cx', 'cy', 'k1', 'k2', 'p1', 'p2', 'k3', 'k4', 'sx1', + 'sy1' + ] + } + + # intrinsic + intrinsic = {} + for camera_id, cam in cameras.items(): + params_dict = { + key: value + for key, value in zip(param_type[cam.model], cam.params) + } + if 'f' in param_type[cam.model]: + params_dict['fx'] = params_dict['f'] + params_dict['fy'] = params_dict['f'] + i = np.array([[params_dict['fx'], 0, params_dict['cx']], + [0, params_dict['fy'], params_dict['cy']], [0, 0, 1]]) + intrinsic[camera_id] = i + + new_images = {} + for i, image_id in enumerate(sorted(images.keys())): + new_images[i + 1] = images[image_id] + images = new_images + + # extrinsic + extrinsic = {} + for image_id, image in images.items(): + e = np.zeros((4, 4)) + e[:3, :3] = qvec2rotmat(image.qvec) + e[:3, 3] = image.tvec + e[3, 3] = 1 + extrinsic[image_id] = e + + # depth range and interval + depth_ranges = {} + for i in range(num_images): + zs = [] + for p3d_id in images[i + 1].point3D_ids: + if p3d_id == -1: + continue + transformed = np.matmul(extrinsic[i + 1], [ + points3d[p3d_id].xyz[0], points3d[p3d_id].xyz[1], + points3d[p3d_id].xyz[2], 1 + ]) + zs.append(transformed[2].item()) + zs_sorted = sorted(zs) + # relaxed depth range + max_ratio = 0.1 + min_ratio = 0.03 + num_max = max(5, int(len(zs) * max_ratio)) + num_min = max(1, int(len(zs) * min_ratio)) + depth_min = 1.0 * sum(zs_sorted[:num_min]) / len(zs_sorted[:num_min]) + depth_max = 1.0 * sum(zs_sorted[-num_max:]) / len(zs_sorted[-num_max:]) + if args.max_d == 0: + image_int = intrinsic[images[i + 1].camera_id] + image_ext = extrinsic[i + 1] + image_r = image_ext[0:3, 0:3] + image_t = image_ext[0:3, 3] + p1 = [image_int[0, 2], image_int[1, 2], 1] + p2 = [image_int[0, 2] + 1, image_int[1, 2], 1] + P1 = np.matmul(np.linalg.inv(image_int), p1) * depth_min + P1 = np.matmul(np.linalg.inv(image_r), (P1 - image_t)) + P2 = np.matmul(np.linalg.inv(image_int), p2) * depth_min + P2 = np.matmul(np.linalg.inv(image_r), (P2 - image_t)) + depth_num = (1 / depth_min - 1 / depth_max) / ( + 1 / depth_min - 1 / (depth_min + np.linalg.norm(P2 - P1))) + else: + depth_num = args.max_d + 
depth_interval = (depth_max - depth_min) / (depth_num + - 1) / args.interval_scale + depth_ranges[i + 1] = (depth_min, depth_interval, depth_num, depth_max) + + # view selection + score = np.zeros((len(images), len(images))) + queue = [] + for i in range(len(images)): + for j in range(i + 1, len(images)): + queue.append((i, j)) + + p = mp.Pool(processes=mp.cpu_count()) + func = partial( + calc_score, + images=images, + points3d=points3d, + args=args, + extrinsic=extrinsic) + result = p.map(func, queue) + for i, j, s in result: + score[i, j] = s + score[j, i] = s + view_sel = [] + for i in range(len(images)): + sorted_score = np.argsort(score[i])[::-1] + view_sel.append([(k, score[i, k]) for k in sorted_score[:10]]) + + # write + os.makedirs(cam_dir, exist_ok=True) + + for i in range(num_images): + with open(os.path.join(cam_dir, '%08d_cam.txt' % i), 'w') as f: + f.write('extrinsic\n') + for j in range(4): + for k in range(4): + f.write(str(extrinsic[i + 1][j, k]) + ' ') + f.write('\n') + f.write('\nintrinsic\n') + for j in range(3): + for k in range(3): + f.write( + str(intrinsic[images[i + 1].camera_id][j, k]) + ' ') + f.write('\n') + f.write('\n%f %f %f %f\n' % + (depth_ranges[i + 1][0], depth_ranges[i + 1][1], + depth_ranges[i + 1][2], depth_ranges[i + 1][3])) + with open(os.path.join(args.save_folder, 'pair.txt'), 'w') as f: + f.write('%d\n' % len(images)) + for i, sorted_score in enumerate(view_sel): + f.write('%d\n%d ' % (i, len(sorted_score))) + for image_id, s in sorted_score: + f.write('%d %f ' % (image_id, s)) + f.write('\n') + + # convert to jpg + for i in range(num_images): + img_path = os.path.join(image_dir, images[i + 1].name) + if not img_path.endswith('.jpg'): + img = cv2.imread(img_path) + cv2.imwrite(os.path.join(image_converted_dir, '%08d.jpg' % i), img) + else: + shutil.copyfile( + os.path.join(image_dir, images[i + 1].name), + os.path.join(image_converted_dir, '%08d.jpg' % i)) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py new file mode 100644 index 000000000..05f1214a9 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/depth_filter.py @@ -0,0 +1,249 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import os + +import cv2 +import numpy as np +from PIL import Image +from plyfile import PlyData, PlyElement + +from .general_eval_dataset import read_pfm + + +# read intrinsics and extrinsics +def read_camera_parameters(filename): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + # assume the feature is 1/4 of the original image size + # intrinsics[:2, :] /= 4 + return intrinsics, extrinsics + + +# read an image +def read_img(filename): + img = Image.open(filename) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
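+    # keep float32 values in [0, 1]; filter_depth() below reuses them directly as per-vertex colors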
+ return np_img + + +# read a binary mask +def read_mask(filename): + return read_img(filename) > 0.5 + + +# save a binary mask +def save_mask(filename, mask): + assert mask.dtype == bool + mask = mask.astype(np.uint8) * 255 + Image.fromarray(mask).save(filename) + + +# read a pair file, [(ref_view1, [src_view1-1, ...]), (ref_view2, [src_view2-1, ...]), ...] +def read_pair_file(filename): + data = [] + with open(filename) as f: + num_viewpoint = int(f.readline()) + # 49 viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [int(x) for x in f.readline().rstrip().split()[1::2]] + if len(src_views) > 0: + data.append((ref_view, src_views)) + return data + + +# project the reference point cloud into the source view, then project back +def reproject_with_depth(depth_ref, intrinsics_ref, extrinsics_ref, depth_src, + intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + # step1. project reference pixels to the source view + # reference view x, y + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + x_ref, y_ref = x_ref.reshape([-1]), y_ref.reshape([-1]) + # reference 3D space + xyz_ref = np.matmul( + np.linalg.inv(intrinsics_ref), + np.vstack( + (x_ref, y_ref, np.ones_like(x_ref))) * depth_ref.reshape([-1])) + # source 3D space + xyz_src = np.matmul( + np.matmul(extrinsics_src, np.linalg.inv(extrinsics_ref)), + np.vstack((xyz_ref, np.ones_like(x_ref))))[:3] + # source view x, y + K_xyz_src = np.matmul(intrinsics_src, xyz_src) + xy_src = K_xyz_src[:2] / K_xyz_src[2:3] + + # step2. reproject the source view points with source view depth estimation + # find the depth estimation of the source view + x_src = xy_src[0].reshape([height, width]).astype(np.float32) + y_src = xy_src[1].reshape([height, width]).astype(np.float32) + sampled_depth_src = cv2.remap( + depth_src, x_src, y_src, interpolation=cv2.INTER_LINEAR) + + # source 3D space + # NOTE that we should use sampled source-view depth_here to project back + xyz_src = np.matmul( + np.linalg.inv(intrinsics_src), + np.vstack( + (xy_src, np.ones_like(x_ref))) * sampled_depth_src.reshape([-1])) + # reference 3D space + xyz_reprojected = np.matmul( + np.matmul(extrinsics_ref, np.linalg.inv(extrinsics_src)), + np.vstack((xyz_src, np.ones_like(x_ref))))[:3] + # source view x, y, depth + depth_reprojected = xyz_reprojected[2].reshape([height, + width]).astype(np.float32) + K_xyz_reprojected = np.matmul(intrinsics_ref, xyz_reprojected) + xy_reprojected = K_xyz_reprojected[:2] / K_xyz_reprojected[2:3] + x_reprojected = xy_reprojected[0].reshape([height, + width]).astype(np.float32) + y_reprojected = xy_reprojected[1].reshape([height, + width]).astype(np.float32) + + return depth_reprojected, x_reprojected, y_reprojected, x_src, y_src + + +def check_geometric_consistency(depth_ref, intrinsics_ref, extrinsics_ref, + depth_src, intrinsics_src, extrinsics_src): + width, height = depth_ref.shape[1], depth_ref.shape[0] + x_ref, y_ref = np.meshgrid(np.arange(0, width), np.arange(0, height)) + depth_reprojected, x2d_reprojected, y2d_reprojected, x2d_src, y2d_src = reproject_with_depth( + depth_ref, intrinsics_ref, extrinsics_ref, depth_src, intrinsics_src, + extrinsics_src) + # check |p_reproj-p_1| < 1 + dist = np.sqrt((x2d_reprojected - x_ref)**2 + (y2d_reprojected - y_ref)**2) + + # check |d_reproj-d_1| / d_1 < 0.01 + depth_diff = np.abs(depth_reprojected - depth_ref) + relative_depth_diff = depth_diff / depth_ref + + mask = np.logical_and(dist < 1, 
relative_depth_diff < 0.01) + depth_reprojected[~mask] = 0 + + return mask, depth_reprojected, x2d_src, y2d_src + + +def filter_depth(pair_folder, scan_folder, out_folder, thres_view): + # the pair file + pair_file = os.path.join(pair_folder, 'pair.txt') + # for the final point cloud + vertexs = [] + vertex_colors = [] + + pair_data = read_pair_file(pair_file) + + # for each reference view and the corresponding source views + for ref_view, src_views in pair_data: + # src_views = src_views[:args.num_view] + # load the camera parameters + ref_intrinsics, ref_extrinsics = read_camera_parameters( + os.path.join(scan_folder, 'cams/{:0>8}_cam.txt'.format(ref_view))) + # load the reference image + ref_img = read_img( + os.path.join(scan_folder, 'images/{:0>8}.jpg'.format(ref_view))) + # load the estimated depth of the reference view + ref_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(ref_view)))[0] + # load the photometric mask of the reference view + confidence = read_pfm( + os.path.join(out_folder, + 'confidence/{:0>8}.pfm'.format(ref_view)))[0] + photo_mask = confidence > 0.4 + + all_srcview_depth_ests = [] + all_srcview_x = [] + all_srcview_y = [] + all_srcview_geomask = [] + + # compute the geometric mask + geo_mask_sum = 0 + for src_view in src_views: + # camera parameters of the source view + src_intrinsics, src_extrinsics = read_camera_parameters( + os.path.join(scan_folder, + 'cams/{:0>8}_cam.txt'.format(src_view))) + # the estimated depth of the source view + src_depth_est = read_pfm( + os.path.join(out_folder, + 'depth_est/{:0>8}.pfm'.format(src_view)))[0] + + geo_mask, depth_reprojected, x2d_src, y2d_src = check_geometric_consistency( + ref_depth_est, ref_intrinsics, ref_extrinsics, src_depth_est, + src_intrinsics, src_extrinsics) + geo_mask_sum += geo_mask.astype(np.int32) + all_srcview_depth_ests.append(depth_reprojected) + all_srcview_x.append(x2d_src) + all_srcview_y.append(y2d_src) + all_srcview_geomask.append(geo_mask) + + depth_est_averaged = (sum(all_srcview_depth_ests) + ref_depth_est) / ( + geo_mask_sum + 1) + # at least 3 source views matched + geo_mask = geo_mask_sum >= thres_view + final_mask = np.logical_and(photo_mask, geo_mask) + + os.makedirs(os.path.join(out_folder, 'mask'), exist_ok=True) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_photo.png'.format(ref_view)), + photo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_geo.png'.format(ref_view)), + geo_mask) + save_mask( + os.path.join(out_folder, 'mask/{:0>8}_final.png'.format(ref_view)), + final_mask) + + height, width = depth_est_averaged.shape[:2] + x, y = np.meshgrid(np.arange(0, width), np.arange(0, height)) + valid_points = final_mask + x, y, depth = x[valid_points], y[valid_points], depth_est_averaged[ + valid_points] + + color = ref_img[valid_points] + + xyz_ref = np.matmul( + np.linalg.inv(ref_intrinsics), + np.vstack((x, y, np.ones_like(x))) * depth) + xyz_world = np.matmul( + np.linalg.inv(ref_extrinsics), np.vstack( + (xyz_ref, np.ones_like(x))))[:3] + vertexs.append(xyz_world.transpose((1, 0))) + vertex_colors.append((color * 255).astype(np.uint8)) + + vertexs = np.concatenate(vertexs, axis=0) + vertex_colors = np.concatenate(vertex_colors, axis=0) + vertexs = np.array([tuple(v) for v in vertexs], + dtype=[('x', 'f4'), ('y', 'f4'), ('z', 'f4')]) + vertex_colors = np.array([tuple(v) for v in vertex_colors], + dtype=[('red', 'u1'), ('green', 'u1'), + ('blue', 'u1')]) + + vertex_all = np.empty( + len(vertexs), vertexs.dtype.descr + 
vertex_colors.dtype.descr) + for prop in vertexs.dtype.names: + vertex_all[prop] = vertexs[prop] + for prop in vertex_colors.dtype.names: + vertex_all[prop] = vertex_colors[prop] + + el = PlyElement.describe(vertex_all, 'vertex') + # PlyData([el]).write(plyfilename) + pcd = PlyData([el]) + + return pcd + + +def pcd_depth_filter(scene, test_dir, save_dir, thres_view): + old_scene_folder = os.path.join(test_dir, scene) + new_scene_folder = os.path.join(save_dir, scene) + out_folder = os.path.join(save_dir, scene) + pcd = filter_depth(old_scene_folder, new_scene_folder, out_folder, + thres_view) + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py new file mode 100644 index 000000000..0719d3fa0 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/general_eval_dataset.py @@ -0,0 +1,374 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import re +import sys + +import cv2 +import numpy as np +from PIL import Image +from torch.utils.data import Dataset +from torchvision import transforms + + +def read_pfm(filename): + file = open(filename, 'rb') + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().decode('utf-8').rstrip() + if header == 'PF': + color = True + elif header == 'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(r'^(\d+)\s(\d+)\s$', file.readline().decode('utf-8')) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + file.close() + return data, scale + + +def save_pfm(filename, image, scale=1): + file = open(filename, 'wb') + color = None + + image = np.flipud(image) + + if image.dtype.name != 'float32': + raise Exception('Image dtype must be float32.') + + if len(image.shape) == 3 and image.shape[2] == 3: # color image + color = True + elif len(image.shape) == 2 or len( + image.shape) == 3 and image.shape[2] == 1: # greyscale + color = False + else: + raise Exception( + 'Image must have H x W x 3, H x W x 1 or H x W dimensions.') + + file.write('PF\n'.encode('utf-8') if color else 'Pf\n'.encode('utf-8')) + file.write('{} {}\n'.format(image.shape[1], + image.shape[0]).encode('utf-8')) + + endian = image.dtype.byteorder + + if endian == '<' or endian == '=' and sys.byteorder == 'little': + scale = -scale + + file.write(('%f\n' % scale).encode('utf-8')) + + image.tofile(file) + file.close() + + +S_H, S_W = 0, 0 + + +class MVSDataset(Dataset): + + def __init__(self, root_dir, list_file, mode, n_views, **kwargs): + super(MVSDataset, self).__init__() + + self.root_dir = root_dir + self.list_file = list_file + self.mode = mode + self.n_views = n_views + + assert self.mode in ['train', 'val', 'test'] + + self.total_depths = 192 + self.interval_scale = 1.06 + + self.data_scale = kwargs.get('data_scale', 'mid') # mid / raw + self.robust_train = kwargs.get('robust_train', False) # True / False + self.color_augment = transforms.ColorJitter( + brightness=0.5, contrast=0.5) + + if self.mode == 'test': + self.max_wh = kwargs.get('max_wh', (1600, 1200)) + self.max_w, 
self.max_h = self.max_wh + + self.fix_res = kwargs.get( + 'fix_res', False) # whether to fix the resolution of input image. + self.fix_wh = False + + # self.metas = self.build_metas() + self.metas = self.build_list() + + def build_list(self): + metas = [] + scans = self.list_file + # logger.info("MVSDataset scans:", scans) + + interval_scale_dict = {} + # scans + for scan in scans: + # determine the interval scale of each scene. default is 1.06 + if isinstance(self.interval_scale, float): + interval_scale_dict[scan] = self.interval_scale + else: + interval_scale_dict[scan] = self.interval_scale[scan] + + pair_file = '{}/pair.txt'.format(scan) + # read the pair file + with open(os.path.join(self.root_dir, pair_file)) as f: + num_viewpoint = int(f.readline()) + # viewpoints + for view_idx in range(num_viewpoint): + ref_view = int(f.readline().rstrip()) + src_views = [ + int(x) for x in f.readline().rstrip().split()[1::2] + ] + # filter by no src view and fill to nviews + if len(src_views) > 0: + if len(src_views) < self.n_views: + src_views += [src_views[0]] * ( + self.n_views - len(src_views)) + metas.append((scan, ref_view, src_views, scan)) + + self.interval_scale = interval_scale_dict + return metas + + def __len__(self): + return len(self.metas) + + def read_cam_file(self, filename, interval_scale): + with open(filename) as f: + lines = f.readlines() + lines = [line.rstrip() for line in lines] + # extrinsics: line [1,5), 4x4 matrix + extrinsics = np.fromstring( + ' '.join(lines[1:5]), dtype=np.float32, sep=' ').reshape((4, 4)) + # intrinsics: line [7-10), 3x3 matrix + intrinsics = np.fromstring( + ' '.join(lines[7:10]), dtype=np.float32, sep=' ').reshape((3, 3)) + intrinsics[:2, :] /= 4.0 + # depth_min & depth_interval: line 11 + depth_min = float(lines[11].split()[0]) + depth_interval = float(lines[11].split()[1]) + + if len(lines[11].split()) >= 3: + num_depth = lines[11].split()[2] + depth_max = depth_min + int(float(num_depth)) * depth_interval + depth_interval = (depth_max - depth_min) / self.total_depths + + depth_interval *= interval_scale + + return intrinsics, extrinsics, depth_min, depth_interval + + def read_img(self, filename): + img = Image.open(filename) + if self.mode == 'train' and self.robust_train: + img = self.color_augment(img) + # scale 0~255 to 0~1 + np_img = np.array(img, dtype=np.float32) / 255. 
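+        # the ColorJitter augmentation above is applied only for robust training; at test time images are just rescaled to [0, 1]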
+ return np_img + + def crop_img(self, img): + raw_h, raw_w = img.shape[:2] + start_h = (raw_h - 1024) // 2 + start_w = (raw_w - 1280) // 2 + return img[start_h:start_h + 1024, + start_w:start_w + 1280, :] # (1024, 1280) + + def prepare_img(self, hr_img): + h, w = hr_img.shape + if self.data_scale == 'mid': + hr_img_ds = cv2.resize( + hr_img, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST) + h, w = hr_img_ds.shape + target_h, target_w = 512, 640 + start_h, start_w = (h - target_h) // 2, (w - target_w) // 2 + hr_img_crop = hr_img_ds[start_h:start_h + target_h, + start_w:start_w + target_w] + elif self.data_scale == 'raw': + hr_img_crop = hr_img[h // 2 - 1024 // 2:h // 2 + 1024 // 2, + w // 2 - 1280 // 2:w // 2 + + 1280 // 2] # (1024, 1280) + return hr_img_crop + + def scale_mvs_input(self, img, intrinsics, max_w, max_h, base=64): + h, w = img.shape[:2] + if h > max_h or w > max_w: + scale = 1.0 * max_h / h + if scale * w > max_w: + scale = 1.0 * max_w / w + new_w, new_h = scale * w // base * base, scale * h // base * base + else: + new_w, new_h = 1.0 * w // base * base, 1.0 * h // base * base + + scale_w = 1.0 * new_w / w + scale_h = 1.0 * new_h / h + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + img = cv2.resize(img, (int(new_w), int(new_h))) + + return img, intrinsics + + def read_mask_hr(self, filename): + img = Image.open(filename) + np_img = np.array(img, dtype=np.float32) + np_img = (np_img > 10).astype(np.float32) + np_img = self.prepare_img(np_img) + + h, w = np_img.shape + np_img_ms = { + 'stage1': + cv2.resize( + np_img, (w // 8, h // 8), interpolation=cv2.INTER_NEAREST), + 'stage2': + cv2.resize( + np_img, (w // 4, h // 4), interpolation=cv2.INTER_NEAREST), + 'stage3': + cv2.resize( + np_img, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST), + 'stage4': + np_img, + } + return np_img_ms + + def read_depth_hr(self, filename, scale): + depth_hr = np.array(read_pfm(filename)[0], dtype=np.float32) * scale + depth_lr = self.prepare_img(depth_hr) + + h, w = depth_lr.shape + depth_lr_ms = { + 'stage1': + cv2.resize( + depth_lr, (w // 8, h // 8), interpolation=cv2.INTER_NEAREST), + 'stage2': + cv2.resize( + depth_lr, (w // 4, h // 4), interpolation=cv2.INTER_NEAREST), + 'stage3': + cv2.resize( + depth_lr, (w // 2, h // 2), interpolation=cv2.INTER_NEAREST), + 'stage4': + depth_lr, + } + return depth_lr_ms + + def __getitem__(self, idx): + global S_H, S_W + meta = self.metas[idx] + scan, ref_view, src_views, scene_name = meta + # use only the reference view and first nviews-1 source views + view_ids = [ref_view] + src_views[:self.n_views - 1] + + scale_ratio = 1 + + imgs = [] + depth_values = None + proj_matrices = [] + + for i, vid in enumerate(view_ids): + img_filename = os.path.join( + self.root_dir, '{}/images_post/{:0>8}.jpg'.format(scan, vid)) + if not os.path.exists(img_filename): + img_filename = os.path.join( + self.root_dir, '{}/images/{:0>8}.jpg'.format(scan, vid)) + + proj_mat_filename = os.path.join( + self.root_dir, '{}/cams/{:0>8}_cam.txt'.format(scan, vid)) + + img = self.read_img(img_filename) + intrinsics, extrinsics, depth_min, depth_interval = self.read_cam_file( + proj_mat_filename, + interval_scale=self.interval_scale[scene_name]) + # scale input + img, intrinsics = self.scale_mvs_input(img, intrinsics, self.max_w, + self.max_h) + + if self.fix_res: + # using the same standard height or width in entire scene. 
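+                # (the first view that reaches this branch freezes S_H/S_W for every later sample, since fix_res is cleared and fix_wh is set right below)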
+ S_H, S_W = img.shape[:2] + self.fix_res = False + self.fix_wh = True + + if i == 0: + if not self.fix_wh: + # using the same standard height or width in each nviews. + S_H, S_W = img.shape[:2] + + # resize to standard height or width + c_h, c_w = img.shape[:2] + if (c_h != S_H) or (c_w != S_W): + scale_h = 1.0 * S_H / c_h + scale_w = 1.0 * S_W / c_w + img = cv2.resize(img, (S_W, S_H)) + intrinsics[0, :] *= scale_w + intrinsics[1, :] *= scale_h + + ################# + imgs.append(img.transpose(2, 0, 1)) + + # reference view + if i == 0: + # @Note depth values + diff = 0.5 if self.mode in ['test', 'val'] else 0 + depth_max = depth_interval * (self.total_depths + - diff) + depth_min + depth_values = np.array( + [depth_min * scale_ratio, depth_max * scale_ratio], + dtype=np.float32) + + proj_mat = np.zeros(shape=(2, 4, 4), dtype=np.float32) + proj_mat[0, :4, :4] = extrinsics + proj_mat[1, :3, :3] = intrinsics + proj_matrices.append(proj_mat) + + proj_matrices = np.stack(proj_matrices) + intrinsics = np.stack(intrinsics) + stage1_pjmats = proj_matrices.copy() + stage1_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] / 2.0 + stage1_ins = intrinsics.copy() + stage1_ins[:2, :] = intrinsics[:2, :] / 2.0 + stage3_pjmats = proj_matrices.copy() + stage3_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 2 + stage3_ins = intrinsics.copy() + stage3_ins[:2, :] = intrinsics[:2, :] * 2.0 + stage4_pjmats = proj_matrices.copy() + stage4_pjmats[:, 1, :2, :] = proj_matrices[:, 1, :2, :] * 4 + stage4_ins = intrinsics.copy() + stage4_ins[:2, :] = intrinsics[:2, :] * 4.0 + proj_matrices = { + 'stage1': stage1_pjmats, + 'stage2': proj_matrices, + 'stage3': stage3_pjmats, + 'stage4': stage4_pjmats + } + intrinsics_matrices = { + 'stage1': stage1_ins, + 'stage2': intrinsics, + 'stage3': stage3_ins, + 'stage4': stage4_ins + } + + sample = { + 'imgs': imgs, + 'proj_matrices': proj_matrices, + 'intrinsics_matrices': intrinsics_matrices, + 'depth_values': depth_values, + 'filename': scan + '/{}/' + '{:0>8}'.format(view_ids[0]) + '{}' + } + return sample diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py new file mode 100644 index 000000000..0777945af --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/geomvsnet_model.py @@ -0,0 +1,196 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +import os.path as osp +import time + +import cv2 +import numpy as np +import torch +from easydict import EasyDict as edict +from torch.utils.data import DataLoader + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .colmap2mvsnet import processing_single_scene +from .depth_filter import pcd_depth_filter +from .general_eval_dataset import MVSDataset, save_pfm +from .models.geomvsnet import GeoMVSNet +from .models.utils import * +from .models.utils.opts import get_opts +from .utils import (generate_pointcloud, numpy2torch, tensor2numpy, tocuda, + write_cam) + +logger = get_logger() + + +@MODELS.register_module( + Tasks.image_multi_view_depth_estimation, + module_name=Models.image_geomvsnet_depth_estimation) +class GeoMVSNetDepthEstimation(TorchModel): + ''' + GeoMVSNet is a state-of-the-art MVS(multi-view stereo) depth estimation method. 
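+    Given multi-view images with COLMAP poses, ``preprocess_make_pair`` builds the MVSNet-style inputs,
+    ``forward`` predicts per-view depth and confidence maps, and ``postprocess`` fuses them into a point cloud.
+    A minimal usage sketch (directory names are placeholders, not part of the API):
+        model = GeoMVSNetDepthEstimation(model_dir)
+        inputs = {'input_dir': dense_dir,           # COLMAP dense folder containing images/ and sparse/
+                  'casmvs_inp_dir': mvs_scene_dir,  # converted MVSNet inputs for one scene
+                  'casmvs_res_dir': mvs_out_dir}    # where per-scene depth_est/ and confidence/ maps are written
+        model.preprocess_make_pair(inputs)
+        model.forward(inputs)
+        pcd = model.postprocess(inputs)  # plyfile.PlyData point cloud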
+ For more details, please refer to https://github.com/doublez0108/geomvsnet + ''' + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + self.n_views = 5 + self.levels = 4 + self.hypo_plane_num_stages = '8,8,4,4' + self.depth_interal_ratio_stages = '0.5,0.5,0.5,1' + self.feat_base_channel = 8 + self.reg_base_channel = 8 + self.group_cor_dim_stages = '8,8,4,4' + self.batch_size = 1 + + self.model = GeoMVSNet( + levels=self.levels, + hypo_plane_num_stages=[ + int(n) for n in self.hypo_plane_num_stages.split(',') + ], + depth_interal_ratio_stages=[ + float(ir) for ir in self.depth_interal_ratio_stages.split(',') + ], + feat_base_channel=self.feat_base_channel, + reg_base_channel=self.reg_base_channel, + group_cor_dim_stages=[ + int(n) for n in self.group_cor_dim_stages.split(',') + ], + ) + + # load checkpoint file + ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + logger.info(f'loading model {ckpt_path}') + state_dict = torch.load(ckpt_path, map_location=torch.device('cpu')) + self.model.load_state_dict(state_dict['model'], strict=False) + + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' + + self.model.to(self.device) + self.model.eval() + logger.info(f'model init done! Device:{self.device}') + + def preprocess_make_pair(self, inputs): + + data = inputs['input_dir'] + casmvs_inp_dir = inputs['casmvs_inp_dir'] + + args = edict() + args.dense_folder = data + args.save_folder = casmvs_inp_dir + args.max_d = 192 + args.interval_scale = 1.06 + args.theta0 = 5 + args.sigma1 = 1 + args.sigma2 = 10 + args.model_ext = '.bin' + + logger.info('preprocess of making pair data start, folder: %s', + args.dense_folder) + processing_single_scene(args) + logger.info('preprocess of making pair data done') + + def forward(self, inputs): + + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + test_list = [scene] + save_dir = inputs['casmvs_res_dir'] + + logger.info('depth estimation start') + + test_dataset = MVSDataset( + test_dir, test_list, 'test', self.n_views, max_wh=(1600, 1200)) + TestImgLoader = DataLoader( + test_dataset, + self.batch_size, + shuffle=False, + num_workers=4, + drop_last=False) + + total_time = 0 + with torch.no_grad(): + for batch_idx, sample in enumerate(TestImgLoader): + sample_cuda = tocuda(sample) + + # @Note GeoMVSNet main + start_time = time.time() + outputs = self.model(sample_cuda['imgs'], + sample_cuda['proj_matrices'], + sample_cuda['intrinsics_matrices'], + sample_cuda['depth_values'], + sample['filename']) + end_time = time.time() + total_time += end_time - start_time + + outputs = tensor2numpy(outputs) + del sample_cuda + filenames = sample['filename'] + cams = sample['proj_matrices']['stage{}'.format( + self.levels)].numpy() + imgs = sample['imgs'] + logger.info('Iter {}/{}, Time:{:.3f} Res:{}'.format( + batch_idx, len(TestImgLoader), end_time - start_time, + imgs[0].shape)) + + for filename, cam, img, depth_est, photometric_confidence in zip( + filenames, cams, imgs, outputs['depth'], + outputs['photometric_confidence']): + img = img[0].numpy() # ref view + cam = cam[0] # ref cam + + depth_filename = os.path.join( + save_dir, filename.format('depth_est', '.pfm')) + confidence_filename = os.path.join( + save_dir, filename.format('confidence', '.pfm')) + cam_filename = os.path.join( + save_dir, filename.format('cams', '_cam.txt')) + img_filename = os.path.join( + save_dir, filename.format('images', 
'.jpg')) + os.makedirs( + depth_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs( + confidence_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(cam_filename.rsplit('/', 1)[0], exist_ok=True) + os.makedirs(img_filename.rsplit('/', 1)[0], exist_ok=True) + + # save depth maps + save_pfm(depth_filename, depth_est) + + # save confidence maps + confidence_list = [ + outputs['stage{}'.format(i)] + ['photometric_confidence'].squeeze(0) + for i in range(1, self.levels + 1) + ] + print('confidence_list', len(confidence_list)) + photometric_confidence = confidence_list[-1] + save_pfm(confidence_filename, photometric_confidence) + + # save camera info + write_cam(cam_filename, cam) + img = np.clip(np.transpose(img, (1, 2, 0)) * 255, 0, + 255).astype(np.uint8) + img_bgr = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + cv2.imwrite(img_filename, img_bgr) + + torch.cuda.empty_cache() + logger.info('depth estimation end') + return inputs + + def postprocess(self, inputs): + test_dir = os.path.dirname(inputs['casmvs_inp_dir']) + scene = os.path.basename(inputs['casmvs_inp_dir']) + logger.info('depth fusion start') + pcd = pcd_depth_filter( + scene, test_dir, inputs['casmvs_res_dir'], thres_view=4) + logger.info('depth fusion end') + return pcd diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py new file mode 100644 index 000000000..4f29d642e --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/__init__.py @@ -0,0 +1,2 @@ +from .geomvsnet import GeoMVSNet +from .loss import geomvsnet_loss diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py new file mode 100644 index 000000000..9482ebace --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/filter.py @@ -0,0 +1,38 @@ +# @Description: Basic implementation of Frequency Domain Filtering strategy (Sec 3.2 in the paper). +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import matplotlib.pyplot as plt +import numpy as np +import torch + + +def frequency_domain_filter(depth, rho_ratio): + """ + large rho_ratio -> more information filtered + """ + f = torch.fft.fft2(depth) + fshift = torch.fft.fftshift(f) + + b, h, w = depth.shape + k_h, k_w = h / rho_ratio, w / rho_ratio + + fshift[:, :int(h / 2 - k_h / 2), :] = 0 + fshift[:, int(h / 2 + k_h / 2):, :] = 0 + fshift[:, :, :int(w / 2 - k_w / 2)] = 0 + fshift[:, :, int(w / 2 + k_w / 2):] = 0 + + ishift = torch.fft.ifftshift(fshift) + idepth = torch.fft.ifft2(ishift) + depth_filtered = torch.abs(idepth) + + return depth_filtered + + +def visual_fft_fig(fshift): + fft_fig = torch.abs(20 * torch.log(fshift)) + plt.figure(figsize=(10, 10)) + plt.subplot(121) + plt.imshow(fft_fig[0, :, :], cmap='gray') diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py new file mode 100644 index 000000000..f108b05cc --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geometry.py @@ -0,0 +1,856 @@ +# @Description: Geometric Prior Guided Feature Fusion & Probability Volume Geometry Embedding (Sec 3.1 in the paper). 
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .submodules import ConvBnReLU3D + + +class GeoFeatureFusion(nn.Module): + + def __init__(self, + convolutional_layer_encoding='z', + mask_type='basic', + add_origin_feat_flag=True): + super(GeoFeatureFusion, self).__init__() + + self.convolutional_layer_encoding = convolutional_layer_encoding # std / uv / z / xyz + self.mask_type = mask_type # basic / mean + self.add_origin_feat_flag = add_origin_feat_flag # True / False + + if self.convolutional_layer_encoding == 'std': + self.geoplanes = 0 + elif self.convolutional_layer_encoding == 'uv': + self.geoplanes = 2 + elif self.convolutional_layer_encoding == 'z': + self.geoplanes = 1 + elif self.convolutional_layer_encoding == 'xyz': + self.geoplanes = 3 + self.geofeature = GeometryFeature() + + # rgb encoder + self.rgb_conv_init = convbnrelu( + in_channels=4, out_channels=8, kernel_size=5, stride=1, padding=2) + + self.rgb_encoder_layer1 = BasicBlockGeo( + inplanes=8, planes=16, stride=2, geoplanes=self.geoplanes) + self.rgb_encoder_layer2 = BasicBlockGeo( + inplanes=16, planes=32, stride=1, geoplanes=self.geoplanes) + self.rgb_encoder_layer3 = BasicBlockGeo( + inplanes=32, planes=64, stride=2, geoplanes=self.geoplanes) + self.rgb_encoder_layer4 = BasicBlockGeo( + inplanes=64, planes=128, stride=1, geoplanes=self.geoplanes) + self.rgb_encoder_layer5 = BasicBlockGeo( + inplanes=128, planes=256, stride=2, geoplanes=self.geoplanes) + + self.rgb_decoder_layer4 = deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_layer2 = deconvbnrelu( + in_channels=128, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_layer0 = deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.rgb_decoder_layer = deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgb_decoder_output = deconvbnrelu( + in_channels=8, + out_channels=2, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + # depth encoder + self.depth_conv_init = convbnrelu( + in_channels=2, out_channels=8, kernel_size=5, stride=1, padding=2) + + self.depth_layer1 = BasicBlockGeo( + inplanes=8, planes=16, stride=2, geoplanes=self.geoplanes) + self.depth_layer2 = BasicBlockGeo( + inplanes=16, planes=32, stride=1, geoplanes=self.geoplanes) + self.depth_layer3 = BasicBlockGeo( + inplanes=64, planes=64, stride=2, geoplanes=self.geoplanes) + self.depth_layer4 = BasicBlockGeo( + inplanes=64, planes=128, stride=1, geoplanes=self.geoplanes) + self.depth_layer5 = BasicBlockGeo( + inplanes=256, planes=256, stride=2, geoplanes=self.geoplanes) + + self.decoder_layer3 = deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.decoder_layer4 = deconvbnrelu( + in_channels=128, + out_channels=64, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.decoder_layer5 = deconvbnrelu( + in_channels=64, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.decoder_layer6 = deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + 
output_padding=0) + self.decoder_layer7 = deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + + # output + self.rgbdepth_decoder_stage1 = deconvbnrelu( + in_channels=32, + out_channels=32, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgbdepth_decoder_stage2 = deconvbnrelu( + in_channels=16, + out_channels=16, + kernel_size=5, + stride=2, + padding=2, + output_padding=1) + self.rgbdepth_decoder_stage3 = deconvbnrelu( + in_channels=8, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + self.final_decoder_stage1 = deconvbnrelu( + in_channels=32, + out_channels=32, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.final_decoder_stage2 = deconvbnrelu( + in_channels=16, + out_channels=16, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + self.final_decoder_stage3 = deconvbnrelu( + in_channels=8, + out_channels=8, + kernel_size=3, + stride=1, + padding=1, + output_padding=0) + + self.softmax = nn.Softmax(dim=1) + self.pooling = nn.AvgPool2d(kernel_size=2) + self.sparsepooling = SparseDownSampleClose(stride=2) + + weights_init(self) + + def forward(self, rgb, depth, confidence, depth_values, stage_idx, + origin_feat, intrinsics_matrices_stage): + + rgb = rgb + depth_min, depth_max = depth_values[:, 0, None, None, + None], depth_values[:, -1, None, + None, None] + d = (depth - depth_min) / (depth_max - depth_min) + + if self.mask_type == 'basic': + valid_mask = torch.where(d > 0, torch.full_like(d, 1.0), + torch.full_like(d, 0.0)) + elif self.mask_type == 'mean': + valid_mask = torch.where( + torch.logical_and(d > 0, confidence > confidence.mean()), + torch.full_like(d, 1.0), torch.full_like(d, 0.0)) + + # pre-data preparation + if self.convolutional_layer_encoding in ['uv', 'xyz']: + B, _, W, H = rgb.shape + position = AddCoordsNp(H, W) + position = position.call() + position = torch.from_numpy(position).to(rgb.device).repeat( + B, 1, 1, 1).transpose(-1, 1) + unorm = position[:, 0:1, :, :] + vnorm = position[:, 1:2, :, :] + + vnorm_s2 = self.pooling(vnorm) + vnorm_s3 = self.pooling(vnorm_s2) + vnorm_s4 = self.pooling(vnorm_s3) + + unorm_s2 = self.pooling(unorm) + unorm_s3 = self.pooling(unorm_s2) + unorm_s4 = self.pooling(unorm_s3) + + if self.convolutional_layer_encoding in ['z', 'xyz']: + d_s2, vm_s2 = self.sparsepooling(d, valid_mask) + d_s3, vm_s3 = self.sparsepooling(d_s2, vm_s2) + d_s4, vm_s4 = self.sparsepooling(d_s3, vm_s3) + + if self.convolutional_layer_encoding == 'xyz': + K = intrinsics_matrices_stage + f352 = K[:, 1, 1] + f352 = f352.unsqueeze(1) + f352 = f352.unsqueeze(2) + f352 = f352.unsqueeze(3) + c352 = K[:, 1, 2] + c352 = c352.unsqueeze(1) + c352 = c352.unsqueeze(2) + c352 = c352.unsqueeze(3) + f1216 = K[:, 0, 0] + f1216 = f1216.unsqueeze(1) + f1216 = f1216.unsqueeze(2) + f1216 = f1216.unsqueeze(3) + c1216 = K[:, 0, 2] + c1216 = c1216.unsqueeze(1) + c1216 = c1216.unsqueeze(2) + c1216 = c1216.unsqueeze(3) + + # geometric info + if self.convolutional_layer_encoding == 'std': + geo_s1 = None + geo_s2 = None + geo_s3 = None + geo_s4 = None + elif self.convolutional_layer_encoding == 'uv': + geo_s1 = torch.cat((vnorm, unorm), dim=1) + geo_s2 = torch.cat((vnorm_s2, unorm_s2), dim=1) + geo_s3 = torch.cat((vnorm_s3, unorm_s3), dim=1) + geo_s4 = torch.cat((vnorm_s4, unorm_s4), dim=1) + elif self.convolutional_layer_encoding == 'z': + geo_s1 = d + geo_s2 = d_s2 + geo_s3 = d_s3 + geo_s4 = d_s4 + elif self.convolutional_layer_encoding == 
'xyz': + geo_s1 = self.geofeature(d, vnorm, unorm, H, W, c352, c1216, f352, + f1216) + geo_s2 = self.geofeature(d_s2, vnorm_s2, unorm_s2, H / 2, W / 2, + c352, c1216, f352, f1216) + geo_s3 = self.geofeature(d_s3, vnorm_s3, unorm_s3, H / 4, W / 4, + c352, c1216, f352, f1216) + geo_s4 = self.geofeature(d_s4, vnorm_s4, unorm_s4, H / 8, W / 8, + c352, c1216, f352, f1216) + + # ----------------------------------------------------------------------------------------- + + # 128*160 -> 256*320 -> 512*640 + rgb_feature = self.rgb_conv_init(torch.cat((rgb, d), dim=1)) # b 8 h w + rgb_feature1 = self.rgb_encoder_layer1(rgb_feature, geo_s1, + geo_s2) # b 16 h/2 w/2 + rgb_feature2 = self.rgb_encoder_layer2(rgb_feature1, geo_s2, + geo_s2) # b 32 h/2 w/2 + rgb_feature3 = self.rgb_encoder_layer3(rgb_feature2, geo_s2, + geo_s3) # b 64 h/4 w/4 + rgb_feature4 = self.rgb_encoder_layer4(rgb_feature3, geo_s3, + geo_s3) # b 128 h/4 w/4 + rgb_feature5 = self.rgb_encoder_layer5(rgb_feature4, geo_s3, + geo_s4) # b 256 h/8 w/8 + + rgb_feature_decoder4 = self.rgb_decoder_layer4(rgb_feature5) + rgb_feature4_plus = rgb_feature_decoder4 + rgb_feature4 # b 128 h/4 w/4 + + rgb_feature_decoder2 = self.rgb_decoder_layer2(rgb_feature4_plus) + rgb_feature2_plus = rgb_feature_decoder2 + rgb_feature2 # b 32 h/2 w/2 + + rgb_feature_decoder0 = self.rgb_decoder_layer0(rgb_feature2_plus) + rgb_feature0_plus = rgb_feature_decoder0 + rgb_feature1 # b 16 h/2 w/2 + + rgb_feature_decoder = self.rgb_decoder_layer(rgb_feature0_plus) + rgb_feature_plus = rgb_feature_decoder + rgb_feature # b 8 h w + + rgb_output = self.rgb_decoder_output(rgb_feature_plus) # b 2 h w + + rgb_depth = rgb_output[:, 0:1, :, :] + # rgb_conf = rgb_output[:, 1:2, :, :] + + # ----------------------------------------------------------------------------------------- + + sparsed_feature = self.depth_conv_init( + torch.cat((d, rgb_depth), dim=1)) # b 8 h w + sparsed_feature1 = self.depth_layer1(sparsed_feature, geo_s1, + geo_s2) # b 16 h/2 w/2 + sparsed_feature2 = self.depth_layer2(sparsed_feature1, geo_s2, + geo_s2) # b 32 h/2 w/2 + + sparsed_feature2_plus = torch.cat( + [rgb_feature2_plus, sparsed_feature2], 1) + sparsed_feature3 = self.depth_layer3(sparsed_feature2_plus, geo_s2, + geo_s3) # b 64 h/4 w/4 + sparsed_feature4 = self.depth_layer4(sparsed_feature3, geo_s3, + geo_s3) # b 128 h/4 w/4 + + sparsed_feature4_plus = torch.cat( + [rgb_feature4_plus, sparsed_feature4], 1) + sparsed_feature5 = self.depth_layer5(sparsed_feature4_plus, geo_s3, + geo_s4) # b 256 h/8 w/8 + + # ----------------------------------------------------------------------------------------- + + fusion3 = rgb_feature5 + sparsed_feature5 + decoder_feature3 = self.decoder_layer3(fusion3) # b 128 h/4 w/4 + + fusion4 = sparsed_feature4 + decoder_feature3 + decoder_feature4 = self.decoder_layer4(fusion4) # b 64 h/4 w/4 + + if stage_idx >= 1: + decoder_feature5 = self.decoder_layer5(decoder_feature4) + fusion5 = sparsed_feature2 + decoder_feature5 # b 32 h/2 w/2 + if stage_idx == 1: + rgbdepth_feature = self.rgbdepth_decoder_stage1(fusion5) + if self.add_origin_feat_flag: + final_feature = self.final_decoder_stage1(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage1(rgbdepth_feature) + + if stage_idx >= 2: + decoder_feature6 = self.decoder_layer6(decoder_feature5) + fusion6 = sparsed_feature1 + decoder_feature6 # b 16 h/2 w/2 + if stage_idx == 2: + rgbdepth_feature = self.rgbdepth_decoder_stage2(fusion6) + if self.add_origin_feat_flag: + final_feature = 
self.final_decoder_stage2(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage2(rgbdepth_feature) + + if stage_idx >= 3: + decoder_feature7 = self.decoder_layer7(decoder_feature6) + fusion7 = sparsed_feature + decoder_feature7 # b 8 h w + if stage_idx == 3: + rgbdepth_feature = self.rgbdepth_decoder_stage3(fusion7) + if self.add_origin_feat_flag: + final_feature = self.final_decoder_stage3(rgbdepth_feature + + origin_feat) + else: + final_feature = self.final_decoder_stage3(rgbdepth_feature) + + return final_feature + + +class GeoRegNet2d(nn.Module): + + def __init__(self, + input_channel=128, + base_channel=32, + convolutional_layer_encoding='std'): + super(GeoRegNet2d, self).__init__() + + self.convolutional_layer_encoding = convolutional_layer_encoding # std / uv / z / xyz + self.mask_type = 'basic' # basic / mean + + if self.convolutional_layer_encoding == 'std': + self.geoplanes = 0 + elif self.convolutional_layer_encoding == 'z': + self.geoplanes = 1 + + self.conv_init = ConvBnReLU3D( + input_channel, + out_channels=8, + kernel_size=(1, 3, 3), + pad=(0, 1, 1)) + self.encoder_layer1 = Reg_BasicBlockGeo( + inplanes=8, + planes=16, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer2 = Reg_BasicBlockGeo( + inplanes=16, + planes=32, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer3 = Reg_BasicBlockGeo( + inplanes=32, + planes=64, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer4 = Reg_BasicBlockGeo( + inplanes=64, + planes=128, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + geoplanes=self.geoplanes) + self.encoder_layer5 = Reg_BasicBlockGeo( + inplanes=128, + planes=256, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + padding=(0, 1, 1), + geoplanes=self.geoplanes) + + self.decoder_layer4 = reg_deconvbnrelu( + in_channels=256, + out_channels=128, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + self.decoder_layer3 = reg_deconvbnrelu( + in_channels=128, + out_channels=64, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + self.decoder_layer2 = reg_deconvbnrelu( + in_channels=64, + out_channels=32, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + self.decoder_layer1 = reg_deconvbnrelu( + in_channels=32, + out_channels=16, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + self.decoder_layer = reg_deconvbnrelu( + in_channels=16, + out_channels=8, + kernel_size=(1, 5, 5), + stride=(1, 2, 2), + padding=(0, 2, 2), + output_padding=(0, 1, 1)) + + self.prob = reg_deconvbnrelu( + in_channels=8, + out_channels=1, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1), + output_padding=0) + + self.depthpooling = nn.MaxPool3d((2, 1, 1), (2, 1, 1)) + self.basicpooling = nn.MaxPool3d((1, 2, 2), (1, 2, 2)) + + weights_init(self) + + def forward(self, x, stage_idx, geo_reg_data=None): + + B, C, D, W, H = x.shape + + if stage_idx >= 1 and self.convolutional_layer_encoding == 'z': + prob_volume = geo_reg_data['prob_volume_last'].unsqueeze( + 1) # B 1 D H W + else: + assert self.convolutional_layer_encoding == 'std' + + # geometric info + if self.convolutional_layer_encoding == 'std': + geo_s1 = None + geo_s2 = None + geo_s3 = None + # geo_s4 = None + elif self.convolutional_layer_encoding == 'z': + if stage_idx == 2: + geo_s1 = 
self.depthpooling(prob_volume) + else: + geo_s1 = prob_volume # B 1 D H W + geo_s2 = self.basicpooling(geo_s1) + geo_s3 = self.basicpooling(geo_s2) + + feature = self.conv_init(x) # B 8 D H W + feature1 = self.encoder_layer1(feature, geo_s1, + geo_s1) # B 16 D H/2 W/2 + feature2 = self.encoder_layer2(feature1, geo_s2, + geo_s2) # B 32 D H/2 W/2 + feature3 = self.encoder_layer3(feature2, geo_s2, + geo_s2) # B 64 D H/4 W/4 + feature4 = self.encoder_layer4(feature3, geo_s3, + geo_s3) # B 128 D H/4 W/4 + feature5 = self.encoder_layer5(feature4, geo_s3, + geo_s3) # B 256 D H/8 W/8 + + feature_decoder4 = self.decoder_layer4(feature5) + feature4_plus = feature_decoder4 + feature4 # B 128 D H/4 W/4 + + feature_decoder3 = self.decoder_layer3(feature4_plus) + feature3_plus = feature_decoder3 + feature3 # B 64 D H/4 W/4 + + feature_decoder2 = self.decoder_layer2(feature3_plus) + feature2_plus = feature_decoder2 + feature2 # B 32 D H/2 W/2 + + feature_decoder1 = self.decoder_layer1(feature2_plus) + feature1_plus = feature_decoder1 + feature1 # B 16 D H/2 W/2 + + feature_decoder = self.decoder_layer(feature1_plus) + feature_plus = feature_decoder + feature # B 8 D H W + + x = self.prob(feature_plus) + + return x.squeeze(1) + + +# -------------------------------------------------------------- + + +class BasicBlockGeo(nn.Module): + expansion = 1 + __constants__ = ['downsample'] + + def __init__(self, + inplanes, + planes, + stride=1, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=None, + geoplanes=3): + super(BasicBlockGeo, self).__init__() + + if norm_layer is None: + norm_layer = nn.BatchNorm2d + + if groups != 1 or base_width != 64: + raise ValueError( + 'BasicBlock only supports groups=1 and base_width=64') + if dilation > 1: + raise NotImplementedError( + 'Dilation > 1 not supported in BasicBlock') + + self.conv1 = conv3x3(inplanes + geoplanes, planes, stride) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes + geoplanes, planes) + self.bn2 = norm_layer(planes) + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + conv1x1(inplanes + geoplanes, planes, stride), + norm_layer(planes), + ) + self.downsample = downsample + self.stride = stride + + def forward(self, x, g1=None, g2=None): + identity = x + if g1 is not None: + x = torch.cat((x, g1), 1) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + if g2 is not None: + out = torch.cat((g2, out), 1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +class GeometryFeature(nn.Module): + + def __init__(self): + super(GeometryFeature, self).__init__() + + def forward(self, z, vnorm, unorm, h, w, ch, cw, fh, fw): + x = z * (0.5 * h * (vnorm + 1) - ch) / fh + y = z * (0.5 * w * (unorm + 1) - cw) / fw + return torch.cat((x, y, z), 1) + + +class SparseDownSampleClose(nn.Module): + + def __init__(self, stride): + super(SparseDownSampleClose, self).__init__() + self.pooling = nn.MaxPool2d(stride, stride) + self.large_number = 600 + + def forward(self, d, mask): + encode_d = -(1 - mask) * self.large_number - d + + d = -self.pooling(encode_d) + mask_result = self.pooling(mask) + d_result = d - (1 - mask_result) * self.large_number + + return d_result, mask_result + + +def convbnrelu(in_channels, out_channels, kernel_size=3, stride=1, padding=1): + return nn.Sequential( + nn.Conv2d( + in_channels, + out_channels, + 
kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=False), nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) + + +def deconvbnrelu(in_channels, + out_channels, + kernel_size=5, + stride=2, + padding=2, + output_padding=1): + return nn.Sequential( + nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm2d(out_channels), nn.ReLU(inplace=True)) + + +def weights_init(m): + """Initialize filters with Gaussian random weights""" + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.ConvTranspose2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.in_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def conv3x3(in_planes, + out_planes, + stride=1, + groups=1, + dilation=1, + bias=False, + padding=1): + """3x3 convolution with padding""" + if padding >= 1: + padding = dilation + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + dilation=dilation) + + +def conv1x1(in_planes, out_planes, stride=1, groups=1, bias=False): + """1x1 convolution""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=1, + stride=stride, + groups=groups, + bias=bias) + + +class AddCoordsNp(): + """Add coords to a tensor""" + + def __init__(self, x_dim=64, y_dim=64, with_r=False): + self.x_dim = x_dim + self.y_dim = y_dim + self.with_r = with_r + + def call(self): + """ + input_tensor: (batch, x_dim, y_dim, c) + """ + xx_ones = np.ones([self.x_dim], dtype=np.int32) + xx_ones = np.expand_dims(xx_ones, 1) + + xx_range = np.expand_dims(np.arange(self.y_dim), 0) + + xx_channel = np.matmul(xx_ones, xx_range) + xx_channel = np.expand_dims(xx_channel, -1) + + yy_ones = np.ones([self.y_dim], dtype=np.int32) + yy_ones = np.expand_dims(yy_ones, 0) + + yy_range = np.expand_dims(np.arange(self.x_dim), 1) + + yy_channel = np.matmul(yy_range, yy_ones) + yy_channel = np.expand_dims(yy_channel, -1) + + xx_channel = xx_channel.astype('float32') / (self.y_dim - 1) + yy_channel = yy_channel.astype('float32') / (self.x_dim - 1) + + xx_channel = xx_channel * 2 - 1 + yy_channel = yy_channel * 2 - 1 + + ret = np.concatenate([xx_channel, yy_channel], axis=-1) + + if self.with_r: + rr = np.sqrt( + np.square(xx_channel - 0.5) + np.square(yy_channel - 0.5)) + ret = np.concatenate([ret, rr], axis=-1) + + return ret + + +# -------------------------------------------------------------- + + +class Reg_BasicBlockGeo(nn.Module): + + def __init__(self, + inplanes, + planes, + kernel_size, + stride, + padding, + downsample=None, + groups=1, + base_width=64, + dilation=1, + norm_layer=nn.BatchNorm3d, + geoplanes=3): + super(Reg_BasicBlockGeo, self).__init__() + + self.conv1 = regconv3D( + inplanes + geoplanes, + planes, + kernel_size=(1, 3, 3), + stride=1, + padding=(0, 1, 1)) + self.bn1 = norm_layer(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = regconv3D(planes + geoplanes, planes, kernel_size, stride, + padding) + self.bn2 = norm_layer(planes) + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + regconv1x1(inplanes + geoplanes, planes, kernel_size, stride, + padding), + norm_layer(planes), + ) + self.downsample = 
downsample + self.stride = stride + + def forward(self, x, g1=None, g2=None): + identity = x + if g1 is not None: + x = torch.cat((x, g1), 1) + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + if g2 is not None: + out = torch.cat((g2, out), 1) + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + + return out + + +def regconv3D(in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=1, + dilation=1, + bias=False): + return nn.Conv3d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias, + dilation=dilation) + + +def regconv1x1(in_planes, + out_planes, + kernel_size, + stride, + padding, + groups=1, + bias=False): + return nn.Conv3d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + bias=bias) + + +def reg_deconvbnrelu(in_channels, out_channels, kernel_size, stride, padding, + output_padding): + return nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + output_padding=output_padding, + bias=False), nn.BatchNorm3d(out_channels), nn.ReLU(inplace=True)) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py new file mode 100644 index 000000000..965401d75 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/geomvsnet.py @@ -0,0 +1,267 @@ +# @Description: Main network architecture for GeoMVSNet. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .filter import frequency_domain_filter +from .geometry import GeoFeatureFusion, GeoRegNet2d +from .submodules import (FPN, Reg2d, homo_warping, init_inverse_range, + schedule_inverse_range) + + +class GeoMVSNet(nn.Module): + + def __init__(self, levels, hypo_plane_num_stages, + depth_interal_ratio_stages, feat_base_channel, + reg_base_channel, group_cor_dim_stages): + super(GeoMVSNet, self).__init__() + + self.levels = levels + self.hypo_plane_num_stages = hypo_plane_num_stages + self.depth_interal_ratio_stages = depth_interal_ratio_stages + + self.StageNet = StageNet() + + # feature settings + self.FeatureNet = FPN(base_channels=feat_base_channel) + self.coarest_separate_flag = True + if self.coarest_separate_flag: + self.CoarestFeatureNet = FPN(base_channels=feat_base_channel) + self.GeoFeatureFusionNet = GeoFeatureFusion( + convolutional_layer_encoding='z', + mask_type='basic', + add_origin_feat_flag=True) + + # cost regularization settings + self.RegNet_stages = nn.ModuleList() + self.group_cor_dim_stages = group_cor_dim_stages + self.geo_reg_flag = True + self.geo_reg_encodings = ['std', 'z', 'z', + 'z'] # must use std in idx-0 + for stage_idx in range(self.levels): + in_dim = group_cor_dim_stages[stage_idx] + if self.geo_reg_flag: + self.RegNet_stages.append( + GeoRegNet2d( + input_channel=in_dim, + base_channel=reg_base_channel, + convolutional_layer_encoding=self. 
+ geo_reg_encodings[stage_idx])) + else: + self.RegNet_stages.append( + Reg2d(input_channel=in_dim, base_channel=reg_base_channel)) + + # frequency domain filter settings + self.curriculum_learning_rho_ratios = [9, 4, 2, 1] + + def forward(self, + imgs, + proj_matrices, + intrinsics_matrices, + depth_values, + filename=None): + + features = [] + if self.coarest_separate_flag: + coarsest_features = [] + for nview_idx in range(len(imgs)): + img = imgs[nview_idx] + features.append(self.FeatureNet(img)) # B C H W + if self.coarest_separate_flag: + coarsest_features.append(self.CoarestFeatureNet(img)) + + # coarse-to-fine + outputs = {} + for stage_idx in range(self.levels): + stage_name = 'stage{}'.format(stage_idx + 1) + B, C, H, W = features[0][stage_name].shape + proj_matrices_stage = proj_matrices[stage_name] + intrinsics_matrices_stage = intrinsics_matrices[stage_name] + + # @Note features + if stage_idx == 0: + if self.coarest_separate_flag: + features_stage = [ + feat[stage_name] for feat in coarsest_features + ] + else: + features_stage = [feat[stage_name] for feat in features] + elif stage_idx >= 1: + features_stage = [feat[stage_name] for feat in features] + + ref_img_stage = F.interpolate( + imgs[0], + size=None, + scale_factor=1. / 2**(3 - stage_idx), + mode='bilinear', + align_corners=False) + depth_last = F.interpolate( + depth_last.unsqueeze(1), + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + confidence_last = F.interpolate( + confidence_last.unsqueeze(1), + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + + # reference feature + features_stage[0] = self.GeoFeatureFusionNet( + ref_img_stage, depth_last, confidence_last, depth_values, + stage_idx, features_stage[0], intrinsics_matrices_stage) + + # @Note depth hypos + if stage_idx == 0: + depth_hypo = init_inverse_range( + depth_values, self.hypo_plane_num_stages[stage_idx], + img[0].device, img[0].dtype, H, W) + else: + inverse_min_depth, inverse_max_depth = outputs_stage[ + 'inverse_min_depth'].detach(), \ + outputs_stage['inverse_max_depth'].detach() + depth_hypo = schedule_inverse_range( + inverse_min_depth, inverse_max_depth, + self.hypo_plane_num_stages[stage_idx], H, W) # B D H W + + # @Note cost regularization + geo_reg_data = {} + if self.geo_reg_flag: + geo_reg_data['depth_values'] = depth_values + if stage_idx >= 1 and self.geo_reg_encodings[stage_idx] == 'z': + prob_volume_last = F.interpolate( + prob_volume_last, + size=None, + scale_factor=2, + mode='bilinear', + align_corners=False) + geo_reg_data['prob_volume_last'] = prob_volume_last + + outputs_stage = self.StageNet( + stage_idx, + features_stage, + proj_matrices_stage, + depth_hypo=depth_hypo, + regnet=self.RegNet_stages[stage_idx], + group_cor_dim=self.group_cor_dim_stages[stage_idx], + depth_interal_ratio=self.depth_interal_ratio_stages[stage_idx], + geo_reg_data=geo_reg_data) + + # @Note frequency domain filter + depth_est = outputs_stage['depth'] + depth_est_filtered = frequency_domain_filter( + depth_est, + rho_ratio=self.curriculum_learning_rho_ratios[stage_idx]) + outputs_stage['depth_filtered'] = depth_est_filtered + depth_last = depth_est_filtered + + confidence_last = outputs_stage['photometric_confidence'] + prob_volume_last = outputs_stage['prob_volume'] + + outputs[stage_name] = outputs_stage + outputs.update(outputs_stage) + + return outputs + + +class StageNet(nn.Module): + + def __init__(self, attn_temp=2): + super(StageNet, self).__init__() + self.attn_temp = attn_temp + + def forward(self, + 
stage_idx, + features, + proj_matrices, + depth_hypo, + regnet, + group_cor_dim, + depth_interal_ratio, + geo_reg_data=None): + + # @Note step1: feature extraction + proj_matrices = torch.unbind(proj_matrices, 1) + ref_feature, src_features = features[0], features[1:] + ref_proj, src_projs = proj_matrices[0], proj_matrices[1:] + B, D, H, W = depth_hypo.shape + C = ref_feature.shape[1] + + # @Note step2: cost aggregation + ref_volume = ref_feature.unsqueeze(2).repeat(1, 1, D, 1, 1) + cor_weight_sum = 1e-8 + cor_feats = 0 + for src_idx, (src_fea, + src_proj) in enumerate(zip(src_features, src_projs)): + # save_fn = None + src_proj_new = src_proj[:, 0].clone() + src_proj_new[:, :3, :4] = torch.matmul(src_proj[:, 1, :3, :3], + src_proj[:, 0, :3, :4]) + ref_proj_new = ref_proj[:, 0].clone() + ref_proj_new[:, :3, :4] = torch.matmul(ref_proj[:, 1, :3, :3], + ref_proj[:, 0, :3, :4]) + warped_src = homo_warping(src_fea, src_proj_new, ref_proj_new, + depth_hypo) # B C D H W + + warped_src = warped_src.reshape(B, group_cor_dim, + C // group_cor_dim, D, H, W) + ref_volume = ref_volume.reshape(B, group_cor_dim, + C // group_cor_dim, D, H, W) + cor_feat = (warped_src * ref_volume).mean(2) # B G D H W + del warped_src, src_proj, src_fea + + cor_weight = torch.softmax(cor_feat.sum(1) / self.attn_temp, + 1) / math.sqrt(C) # B D H W + cor_weight_sum += cor_weight # B D H W + cor_feats += cor_weight.unsqueeze(1) * cor_feat # B C D H W + del cor_weight, cor_feat + + cost_volume = cor_feats / cor_weight_sum.unsqueeze(1) # B C D H W + del cor_weight_sum, src_features + + # @Note step3: cost regularization + if geo_reg_data == {}: + # basic + cost_reg = regnet(cost_volume) + else: + # probability volume geometry embedding + cost_reg = regnet(cost_volume, stage_idx, geo_reg_data) + del cost_volume + prob_volume = F.softmax(cost_reg, dim=1) # B D H W + + # @Note step4: depth regression + prob_max_indices = prob_volume.max(1, keepdim=True)[1] # B 1 H W + depth = torch.gather(depth_hypo, 1, + prob_max_indices).squeeze(1) # B H W + + with torch.no_grad(): + photometric_confidence = prob_volume.max(1)[0] # B H W + photometric_confidence = F.interpolate( + photometric_confidence.unsqueeze(1), + scale_factor=1, + mode='bilinear', + align_corners=True).squeeze(1) + + last_depth_itv = 1. / depth_hypo[:, 2, :, :] - 1. / depth_hypo[:, + 1, :, :] + inverse_min_depth = 1 / depth + depth_interal_ratio * last_depth_itv # B H W + inverse_max_depth = 1 / depth - depth_interal_ratio * last_depth_itv # B H W + + output_stage = { + 'depth': depth, + 'photometric_confidence': photometric_confidence, + 'depth_hypo': depth_hypo, + 'prob_volume': prob_volume, + 'inverse_min_depth': inverse_min_depth, + 'inverse_max_depth': inverse_max_depth, + } + return output_stage diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py new file mode 100644 index 000000000..f2c811fb4 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/loss.py @@ -0,0 +1,120 @@ +# @Description: Loss Functions (Sec 3.4 in the paper). 
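+# Overview: geomvsnet_loss accumulates, for every coarse-to-fine stage, a
+# pixel-wise cross-entropy term (pixel_wise_loss) and a depth-distribution
+# similarity term (depth_distribution_similarity_loss), combined per stage as
+# 0.8 * pw_loss + 0.2 * dds_loss and scaled by the per-stage weights stage_lw.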
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import torch + + +def geomvsnet_loss(inputs, depth_gt_ms, mask_ms, **kwargs): + + stage_lw = kwargs.get('stage_lw', [1, 1, 1, 1]) + depth_values = kwargs.get('depth_values') + depth_min, depth_max = depth_values[:, 0], depth_values[:, -1] + + total_loss = torch.tensor( + 0.0, + dtype=torch.float32, + device=mask_ms['stage1'].device, + requires_grad=False) + pw_loss_stages = [] + dds_loss_stages = [] + for stage_idx, (stage_inputs, stage_key) in enumerate([ + (inputs[k], k) for k in inputs.keys() if 'stage' in k + ]): + + depth = stage_inputs['depth_filtered'] + prob_volume = stage_inputs['prob_volume'] + depth_value = stage_inputs['depth_hypo'] + + depth_gt = depth_gt_ms[stage_key] + mask = mask_ms[stage_key] > 0.5 + + # pw loss + pw_loss = pixel_wise_loss(prob_volume, depth_gt, mask, depth_value) + pw_loss_stages.append(pw_loss) + + # dds loss + dds_loss = depth_distribution_similarity_loss(depth, depth_gt, mask, + depth_min, depth_max) + dds_loss_stages.append(dds_loss) + + # total loss + lam1, lam2 = 0.8, 0.2 + total_loss = total_loss + stage_lw[stage_idx] * ( + lam1 * pw_loss + lam2 * dds_loss) + + depth_pred = stage_inputs['depth'] + depth_gt = depth_gt_ms[stage_key] + epe = cal_metrics(depth_pred, depth_gt, mask, depth_min, depth_max) + + return total_loss, epe, pw_loss_stages, dds_loss_stages + + +def pixel_wise_loss(prob_volume, depth_gt, mask, depth_value): + mask_true = mask + valid_pixel_num = torch.sum(mask_true, dim=[1, 2]) + 1e-12 + + shape = depth_gt.shape + + depth_num = depth_value.shape[1] + depth_value_mat = depth_value + + gt_index_image = torch.argmin( + torch.abs(depth_value_mat - depth_gt.unsqueeze(1)), dim=1) + + gt_index_image = torch.mul(mask_true, gt_index_image.type(torch.float)) + gt_index_image = torch.round(gt_index_image).type(torch.long).unsqueeze(1) + + gt_index_volume = torch.zeros(shape[0], depth_num, shape[1], + shape[2]).type(mask_true.type()).scatter_( + 1, gt_index_image, 1) + cross_entropy_image = -torch.sum( + gt_index_volume * torch.log(prob_volume + 1e-12), dim=1).squeeze(1) + masked_cross_entropy_image = torch.mul(mask_true, cross_entropy_image) + masked_cross_entropy = torch.sum(masked_cross_entropy_image, dim=[1, 2]) + + masked_cross_entropy = torch.mean(masked_cross_entropy / valid_pixel_num) + + pw_loss = masked_cross_entropy + return pw_loss + + +def depth_distribution_similarity_loss(depth, depth_gt, mask, depth_min, + depth_max): + depth_norm = depth * 128 / (depth_max - depth_min)[:, None, None] + depth_gt_norm = depth_gt * 128 / (depth_max - depth_min)[:, None, None] + + M_bins = 48 + kl_min = torch.min(torch.min(depth_gt), depth.mean() - 3. * depth.std()) + kl_max = torch.max(torch.max(depth_gt), depth.mean() + 3. 
* depth.std()) + bins = torch.linspace(kl_min, kl_max, steps=M_bins) + + kl_divs = [] + for i in range(len(bins) - 1): + bin_mask = (depth_gt >= bins[i]) & (depth_gt < bins[i + 1]) + merged_mask = mask & bin_mask + + if merged_mask.sum() > 0: + p = depth_norm[merged_mask] + q = depth_gt_norm[merged_mask] + kl_div = torch.nn.functional.kl_div( + torch.log(p) - torch.log(q), p, reduction='batchmean') + kl_div = torch.log(kl_div) + kl_divs.append(kl_div) + + dds_loss = sum(kl_divs) + return dds_loss + + +def cal_metrics(depth_pred, depth_gt, mask, depth_min, depth_max): + depth_pred_norm = depth_pred * 128 / (depth_max - depth_min)[:, None, None] + depth_gt_norm = depth_gt * 128 / (depth_max - depth_min)[:, None, None] + + abs_err = torch.abs(depth_pred_norm[mask] - depth_gt_norm[mask]) + epe = abs_err.mean() + # err1 = (abs_err <= 1).float().mean() * 100 + # err3 = (abs_err <= 3).float().mean() * 100 + + return epe # err1, err3 diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py new file mode 100644 index 000000000..8910ae3b3 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/submodules.py @@ -0,0 +1,379 @@ +# @Description: Some sub-modules for the network. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FPN(nn.Module): + """FPN aligncorners downsample 4x""" + + def __init__(self, base_channels, gn=False): + super(FPN, self).__init__() + self.base_channels = base_channels + + self.conv0 = nn.Sequential( + Conv2d(3, base_channels, 3, 1, padding=1, gn=gn), + Conv2d(base_channels, base_channels, 3, 1, padding=1, gn=gn), + ) + + self.conv1 = nn.Sequential( + Conv2d( + base_channels, + base_channels * 2, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 2, base_channels * 2, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 2, base_channels * 2, 3, 1, padding=1, gn=gn), + ) + + self.conv2 = nn.Sequential( + Conv2d( + base_channels * 2, + base_channels * 4, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 4, base_channels * 4, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 4, base_channels * 4, 3, 1, padding=1, gn=gn), + ) + + self.conv3 = nn.Sequential( + Conv2d( + base_channels * 4, + base_channels * 8, + 5, + stride=2, + padding=2, + gn=gn), + Conv2d( + base_channels * 8, base_channels * 8, 3, 1, padding=1, gn=gn), + Conv2d( + base_channels * 8, base_channels * 8, 3, 1, padding=1, gn=gn), + ) + + self.out_channels = [8 * base_channels] + final_chs = base_channels * 8 + + self.inner1 = nn.Conv2d(base_channels * 4, final_chs, 1, bias=True) + self.inner2 = nn.Conv2d(base_channels * 2, final_chs, 1, bias=True) + self.inner3 = nn.Conv2d(base_channels * 1, final_chs, 1, bias=True) + + self.out1 = nn.Conv2d(final_chs, base_channels * 8, 1, bias=False) + self.out2 = nn.Conv2d( + final_chs, base_channels * 4, 3, padding=1, bias=False) + self.out3 = nn.Conv2d( + final_chs, base_channels * 2, 3, padding=1, bias=False) + self.out4 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + + self.out_channels.append(base_channels * 4) + self.out_channels.append(base_channels * 2) + self.out_channels.append(base_channels) + + def forward(self, x): + conv0 = self.conv0(x) + conv1 = self.conv1(conv0) + conv2 = 
self.conv2(conv1) + conv3 = self.conv3(conv2) + + intra_feat = conv3 + outputs = {} + out1 = self.out1(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner1(conv2) + out2 = self.out2(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner2(conv1) + out3 = self.out3(intra_feat) + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, mode='bilinear', + align_corners=True) + self.inner3(conv0) + out4 = self.out4(intra_feat) + + outputs['stage1'] = out1 + outputs['stage2'] = out2 + outputs['stage3'] = out3 + outputs['stage4'] = out4 + + return outputs + + +class Reg2d(nn.Module): + + def __init__(self, input_channel=128, base_channel=32): + super(Reg2d, self).__init__() + + self.conv0 = ConvBnReLU3D( + input_channel, base_channel, kernel_size=(1, 3, 3), pad=(0, 1, 1)) + self.conv1 = ConvBnReLU3D( + base_channel, + base_channel * 2, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv2 = ConvBnReLU3D(base_channel * 2, base_channel * 2) + + self.conv3 = ConvBnReLU3D( + base_channel * 2, + base_channel * 4, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv4 = ConvBnReLU3D(base_channel * 4, base_channel * 4) + + self.conv5 = ConvBnReLU3D( + base_channel * 4, + base_channel * 8, + kernel_size=(1, 3, 3), + stride=(1, 2, 2), + pad=(0, 1, 1)) + self.conv6 = ConvBnReLU3D(base_channel * 8, base_channel * 8) + + self.conv7 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 8, + base_channel * 4, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel * 4), + nn.ReLU(inplace=True)) + + self.conv9 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 4, + base_channel * 2, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel * 2), + nn.ReLU(inplace=True)) + + self.conv11 = nn.Sequential( + nn.ConvTranspose3d( + base_channel * 2, + base_channel, + kernel_size=(1, 3, 3), + padding=(0, 1, 1), + output_padding=(0, 1, 1), + stride=(1, 2, 2), + bias=False), nn.BatchNorm3d(base_channel), + nn.ReLU(inplace=True)) + + self.prob = nn.Conv3d(8, 1, 1, stride=1, padding=0) + + def forward(self, x): + conv0 = self.conv0(x) + conv2 = self.conv2(self.conv1(conv0)) + conv4 = self.conv4(self.conv3(conv2)) + x = self.conv6(self.conv5(conv4)) + x = conv4 + self.conv7(x) + x = conv2 + self.conv9(x) + x = conv0 + self.conv11(x) + x = self.prob(x) + + return x.squeeze(1) + + +def homo_warping(src_fea, src_proj, ref_proj, depth_values): + # src_fea: [B, C, H, W] + # src_proj: [B, 4, 4] + # ref_proj: [B, 4, 4] + # depth_values: [B, Ndepth] o [B, Ndepth, H, W] + # out: [B, C, Ndepth, H, W] + C = src_fea.shape[1] + Hs, Ws = src_fea.shape[-2:] + B, num_depth, Hr, Wr = depth_values.shape + + with torch.no_grad(): + proj = torch.matmul(src_proj, torch.inverse(ref_proj)) + rot = proj[:, :3, :3] # [B,3,3] + trans = proj[:, :3, 3:4] # [B,3,1] + + y, x = torch.meshgrid([ + torch.arange(0, Hr, dtype=torch.float32, device=src_fea.device), + torch.arange(0, Wr, dtype=torch.float32, device=src_fea.device) + ]) + y = y.reshape(Hr * Wr) + x = x.reshape(Hr * Wr) + xyz = torch.stack((x, y, torch.ones_like(x))) # [3, H*W] + xyz = torch.unsqueeze(xyz, 0).repeat(B, 1, 1) # [B, 3, H*W] + rot_xyz = torch.matmul(rot, xyz) # [B, 3, H*W] + rot_depth_xyz = rot_xyz.unsqueeze(2).repeat( + 1, 1, 
num_depth, 1) * depth_values.reshape( + B, 1, num_depth, -1) # [B, 3, Ndepth, H*W] + proj_xyz = rot_depth_xyz + trans.reshape(B, 3, 1, + 1) # [B, 3, Ndepth, H*W] + # FIXME divide 0 + temp = proj_xyz[:, 2:3, :, :] + temp[temp == 0] = 1e-9 + proj_xy = proj_xyz[:, :2, :, :] / temp # [B, 2, Ndepth, H*W] + # proj_xy = proj_xyz[:, :2, :, :] / proj_xyz[:, 2:3, :, :] # [B, 2, Ndepth, H*W] + + proj_x_normalized = proj_xy[:, 0, :, :] / ((Ws - 1) / 2) - 1 + proj_y_normalized = proj_xy[:, 1, :, :] / ((Hs - 1) / 2) - 1 + proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), + dim=3) # [B, Ndepth, H*W, 2] + grid = proj_xy + if len(src_fea.shape) == 4: + warped_src_fea = F.grid_sample( + src_fea, + grid.reshape(B, num_depth * Hr, Wr, 2), + mode='bilinear', + padding_mode='zeros', + align_corners=True) + warped_src_fea = warped_src_fea.reshape(B, C, num_depth, Hr, Wr) + elif len(src_fea.shape) == 5: + warped_src_fea = [] + for d in range(src_fea.shape[2]): + warped_src_fea.append( + F.grid_sample( + src_fea[:, :, d], + grid.reshape(B, num_depth, Hr, Wr, 2)[:, d], + mode='bilinear', + padding_mode='zeros', + align_corners=True)) + warped_src_fea = torch.stack(warped_src_fea, dim=2) + + return warped_src_fea + + +def init_inverse_range(cur_depth, ndepths, device, dtype, H, W): + inverse_depth_min = 1. / cur_depth[:, 0] # (B,) + inverse_depth_max = 1. / cur_depth[:, -1] + itv = torch.arange( + 0, ndepths, device=device, dtype=dtype, requires_grad=False).reshape( + 1, -1, 1, 1).repeat(1, 1, H, W) / (ndepths - 1) # 1 D H W + inverse_depth_hypo = inverse_depth_max[:, None, None, None] + ( + inverse_depth_min - inverse_depth_max)[:, None, None, None] * itv + + return 1. / inverse_depth_hypo + + +def schedule_inverse_range(inverse_min_depth, inverse_max_depth, ndepths, H, + W): + # cur_depth_min, (B, H, W) + # cur_depth_max: (B, H, W) + itv = torch.arange( + 0, + ndepths, + device=inverse_min_depth.device, + dtype=inverse_min_depth.dtype, + requires_grad=False).reshape(1, -1, 1, 1).repeat( + 1, 1, H // 2, W // 2) / (ndepths - 1) # 1 D H W + + inverse_depth_hypo = inverse_max_depth[:, None, :, :] + ( + inverse_min_depth - inverse_max_depth)[:, None, :, :] * itv # B D H W + inverse_depth_hypo = F.interpolate( + inverse_depth_hypo.unsqueeze(1), [ndepths, H, W], + mode='trilinear', + align_corners=True).squeeze(1) + return 1. 
/ inverse_depth_hypo + + +# -------------------------------------------------------------- + + +def init_bn(module): + if module.weight is not None: + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + return + + +def init_uniform(module, init_method): + if module.weight is not None: + if init_method == 'kaiming': + nn.init.kaiming_uniform_(module.weight) + elif init_method == 'xavier': + nn.init.xavier_uniform_(module.weight) + return + + +class ConvBnReLU3D(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBnReLU3D, self).__init__() + self.conv = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm3d(out_channels) + + def forward(self, x): + return F.relu(self.bn(self.conv(x)), inplace=True) + + +class Conv2d(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn_momentum=0.1, + init_method='xavier', + gn=False, + group_channel=8, + **kwargs): + super(Conv2d, self).__init__() + bn = not gn + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.kernel_size = kernel_size + self.stride = stride + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.gn = nn.GroupNorm( + int(max(1, out_channels + / group_channel)), out_channels) if gn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + else: + x = self.gn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py new file mode 100644 index 000000000..16281fe0b --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/__init__.py @@ -0,0 +1 @@ +from .utils import * diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py new file mode 100644 index 000000000..e6921f55f --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/opts.py @@ -0,0 +1,148 @@ +# @Description: Options settings & configurations for GeoMVSNet. 
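+# get_opts() builds the full command-line interface: network shape
+# (--levels, --hypo_plane_num_stages, --group_cor_dim_stages, ...),
+# training hyper-parameters (--lr, --lrepochs, --epochs, ...) and
+# logging/checkpoint options. Hypothetical invocation for illustration only
+# (the entry-script name is an assumption, not part of this diff):
+#   python train.py --which_dataset dtu --n_views 5 --levels 4 \
+#     --hypo_plane_num_stages 8,8,4,4 --depth_interal_ratio_stages 0.5,0.5,0.5,1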
+# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import argparse + + +def get_opts(): + parser = argparse.ArgumentParser(description='args') + + # global settings + parser.add_argument( + '--mode', + default='train', + help='train or test', + choices=['train', 'test', 'val']) + parser.add_argument( + '--which_dataset', + default='dtu', + choices=['dtu', 'tnt', 'blendedmvs', 'general'], + help='which dataset for using') + + parser.add_argument('--n_views', type=int, default=5, help='num of view') + parser.add_argument('--levels', type=int, default=4, help='num of stages') + parser.add_argument( + '--hypo_plane_num_stages', + type=str, + default='8,8,4,4', + help='num of hypothesis planes for each stage') + parser.add_argument( + '--depth_interal_ratio_stages', + type=str, + default='0.5,0.5,0.5,1', + help='depth interals for each stage') + parser.add_argument( + '--feat_base_channel', + type=int, + default=8, + help='channel num for base feature') + parser.add_argument( + '--reg_base_channel', + type=int, + default=8, + help='channel num for regularization') + parser.add_argument( + '--group_cor_dim_stages', + type=str, + default='8,8,4,4', + help='group correlation dim') + + parser.add_argument( + '--batch_size', type=int, default=1, help='batch size for training') + parser.add_argument( + '--data_scale', + type=str, + choices=['mid', 'raw'], + help='use mid or raw resolution') + parser.add_argument('--trainpath', help='data path for training') + parser.add_argument('--testpath', help='data path for testing') + parser.add_argument('--trainlist', help='data list for training') + parser.add_argument('--testlist', nargs='+', help='data list for testing') + + # training config + parser.add_argument( + '--stage_lw', + type=str, + default='1,1,1,1', + help='loss weight for different stages') + + parser.add_argument( + '--epochs', type=int, default=10, help='number of epochs to train') + parser.add_argument( + '--lr_scheduler', + type=str, + default='MS', + help='scheduler for learning rate') + parser.add_argument( + '--lr', type=float, default=0.001, help='learning rate') + parser.add_argument( + '--lrepochs', + type=str, + default='1,3,5,7,9,11,13,15:1.5', + help='epoch ids to downscale lr and the downscale rate') + parser.add_argument('--wd', type=float, default=0.0, help='weight decay') + + parser.add_argument( + '--summary_freq', + type=int, + default=100, + help='print and summary frequency') + parser.add_argument( + '--save_freq', type=int, default=1, help='save checkpoint frequency') + parser.add_argument( + '--eval_freq', type=int, default=1, help='eval frequency') + + parser.add_argument( + '--robust_train', action='store_true', help='robust training') + + # testing config + parser.add_argument( + '--split', + type=str, + choices=['intermediate', 'advanced'], + help='intermediate|advanced for tanksandtemples') + parser.add_argument( + '--img_mode', + type=str, + default='resize', + choices=['resize', 'crop'], + help='image resolution matching strategy for TNT dataset') + parser.add_argument( + '--cam_mode', + type=str, + default='origin', + choices=['origin', 'short_range'], + help='camera parameter strategy for TNT dataset') + + parser.add_argument( + '--loadckpt', default=None, help='load a specific checkpoint') + parser.add_argument( + '--logdir', + default='./checkpoints/debug', + help='the directory to save checkpoints/logs') + parser.add_argument( + '--nolog', 
action='store_true', help='do not log into .log file') + parser.add_argument( + '--notensorboard', + action='store_true', + help='do not log into tensorboard') + parser.add_argument( + '--save_conf_all_stages', + action='store_true', + help='save confidence maps for all stages') + parser.add_argument('--outdir', default='./outputs', help='output dir') + parser.add_argument( + '--resume', action='store_true', help='continue to train the model') + + # pytorch config + parser.add_argument('--device', default='cuda', help='device to use') + parser.add_argument( + '--seed', type=int, default=1, metavar='S', help='random seed') + parser.add_argument( + '--pin_m', action='store_true', help='data loader pin memory') + parser.add_argument('--local_rank', type=int, default=0) + + return parser.parse_args() diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py new file mode 100644 index 000000000..fe44862c5 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/models/utils/utils.py @@ -0,0 +1,269 @@ +# @Description: Some useful utils. +# @Author: Zhe Zhang (doublez@stu.pku.edu.cn) +# @Affiliation: Peking University (PKU) +# @LastEditDate: 2023-09-07 +# @https://github.com/doublez0108/geomvsnet + +import random +from bisect import bisect_right + +import numpy as np +import torch +import torch.distributed as dist +import torchvision.utils as vutils + + +# torch.no_grad warpper for functions +def make_nograd_func(func): + + def wrapper(*f_args, **f_kwargs): + with torch.no_grad(): + ret = func(*f_args, **f_kwargs) + return ret + + return wrapper + + +# convert a function into recursive style to handle nested dict/list/tuple variables +def make_recursive_func(func): + + def wrapper(vars): + if isinstance(vars, list): + return [wrapper(x) for x in vars] + elif isinstance(vars, tuple): + return tuple([wrapper(x) for x in vars]) + elif isinstance(vars, dict): + return {k: wrapper(v) for k, v in vars.items()} + else: + return func(vars) + + return wrapper + + +@make_recursive_func +def tensor2float(vars): + if isinstance(vars, float): + return vars + elif isinstance(vars, torch.Tensor): + return vars.data.item() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2float'.format(type(vars))) + + +@make_recursive_func +def tensor2numpy(vars): + if isinstance(vars, np.ndarray): + return vars + elif isinstance(vars, torch.Tensor): + return vars.detach().cpu().numpy().copy() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +@make_recursive_func +def tocuda(vars): + if isinstance(vars, torch.Tensor): + return vars.to(torch.device('cuda')) + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +def tb_save_scalars(logger, mode, scalar_dict, global_step): + scalar_dict = tensor2float(scalar_dict) + for key, value in scalar_dict.items(): + if not isinstance(value, (list, tuple)): + name = '{}/{}'.format(mode, key) + logger.add_scalar(name, value, global_step) + else: + for idx in range(len(value)): + name = '{}/{}_{}'.format(mode, key, idx) + logger.add_scalar(name, value[idx], global_step) + + +def tb_save_images(logger, mode, images_dict, global_step): + images_dict = tensor2numpy(images_dict) + + def preprocess(name, img): + if not (len(img.shape) == 3 or len(img.shape) == 4): + raise NotImplementedError( + 
'invalid img shape {}:{} in save_images'.format( + name, img.shape)) + if len(img.shape) == 3: + img = img[:, np.newaxis, :, :] + img = torch.from_numpy(img[:1]) + return vutils.make_grid( + img, padding=0, nrow=1, normalize=True, scale_each=True) + + for key, value in images_dict.items(): + if not isinstance(value, (list, tuple)): + name = '{}/{}'.format(mode, key) + logger.add_image(name, preprocess(name, value), global_step) + else: + for idx in range(len(value)): + name = '{}/{}_{}'.format(mode, key, idx) + logger.add_image(name, preprocess(name, value[idx]), + global_step) + + +class DictAverageMeter(object): + + def __init__(self): + self.data = {} + self.count = 0 + + def update(self, new_input): + self.count += 1 + if len(self.data) == 0: + for k, v in new_input.items(): + if not isinstance(v, float): + raise NotImplementedError('invalid data {}: {}'.format( + k, type(v))) + self.data[k] = v + else: + for k, v in new_input.items(): + if not isinstance(v, float): + raise NotImplementedError('invalid data {}: {}'.format( + k, type(v))) + self.data[k] += v + + def mean(self): + return {k: v / self.count for k, v in self.data.items()} + + +# a wrapper to compute metrics for each image individually +def compute_metrics_for_each_image(metric_func): + + def wrapper(depth_est, depth_gt, mask, *args): + batch_size = depth_gt.shape[0] + results = [] + # compute result one by one + for idx in range(batch_size): + ret = metric_func(depth_est[idx], depth_gt[idx], mask[idx], *args) + results.append(ret) + return torch.stack(results).mean() + + return wrapper + + +@make_nograd_func +@compute_metrics_for_each_image +def Thres_metrics(depth_est, depth_gt, mask, thres): + assert isinstance(thres, (int, float)) + depth_est, depth_gt = depth_est[mask], depth_gt[mask] + errors = torch.abs(depth_est - depth_gt) + err_mask = errors > thres + return torch.mean(err_mask.float()) + + +# NOTE: please do not use this to build up training loss +@make_nograd_func +@compute_metrics_for_each_image +def AbsDepthError_metrics(depth_est, depth_gt, mask, thres=None): + depth_est, depth_gt = depth_est[mask], depth_gt[mask] + error = (depth_est - depth_gt).abs() + if thres is not None: + error = error[(error >= float(thres[0])) & (error <= float(thres[1]))] + if error.shape[0] == 0: + return torch.tensor(0, device=error.device, dtype=error.dtype) + return torch.mean(error) + + +def synchronize(): + """ + Helper function to synchronize (barrier) among all processes when + using distributed training + """ + if not dist.is_available(): + return + if not dist.is_initialized(): + return + world_size = dist.get_world_size() + if world_size == 1: + return + dist.barrier() + + +def get_world_size(): + if not dist.is_available(): + return 1 + if not dist.is_initialized(): + return 1 + return dist.get_world_size() + + +def reduce_scalar_outputs(scalar_outputs): + world_size = get_world_size() + if world_size < 2: + return scalar_outputs + with torch.no_grad(): + names = [] + scalars = [] + for k in sorted(scalar_outputs.keys()): + names.append(k) + scalars.append(scalar_outputs[k]) + scalars = torch.stack(scalars, dim=0) + dist.reduce(scalars, dst=0) + if dist.get_rank() == 0: + # only main process gets accumulated, so only divide by + # world_size in this case + scalars /= world_size + reduced_scalars = {k: v for k, v in zip(names, scalars)} + + return reduced_scalars + + +class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): + + def __init__( + self, + optimizer, + milestones, + gamma=0.1, + warmup_factor=1.0 / 3, + 
warmup_iters=500, + warmup_method='linear', + last_epoch=-1, + ): + if not list(milestones) == sorted(milestones): + raise ValueError( + 'Milestones should be a list of' + ' increasing integers. Got {}', + milestones, + ) + + if warmup_method not in ('constant', 'linear'): + raise ValueError( + "Only 'constant' or 'linear' warmup_method accepted" + 'got {}'.format(warmup_method)) + self.milestones = milestones + self.gamma = gamma + self.warmup_factor = warmup_factor + self.warmup_iters = warmup_iters + self.warmup_method = warmup_method + super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) + + def get_lr(self): + warmup_factor = 1 + if self.last_epoch < self.warmup_iters: + if self.warmup_method == 'constant': + warmup_factor = self.warmup_factor + elif self.warmup_method == 'linear': + alpha = float(self.last_epoch) / self.warmup_iters + warmup_factor = self.warmup_factor * (1 - alpha) + alpha + return [ + base_lr * warmup_factor + * self.gamma**bisect_right(self.milestones, self.last_epoch) + for base_lr in self.base_lrs + ] + + +def set_random_seed(seed): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py new file mode 100644 index 000000000..2ffda232c --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/module.py @@ -0,0 +1,678 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def init_bn(module): + if module.weight is not None: + nn.init.ones_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + return + + +def init_uniform(module, init_method): + if module.weight is not None: + if init_method == 'kaiming': + nn.init.kaiming_uniform_(module.weight) + elif init_method == 'xavier': + nn.init.xavier_uniform_(module.weight) + return + + +class Conv2d(nn.Module): + """Applies a 2D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv2d, self).__init__() + + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.kernel_size = kernel_size + self.stride = stride + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv2d(nn.Module): + """Applies a 2D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
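+ Note: when stride is 2, the transposed-convolution output is cropped to
+ exactly (2*H, 2*W) of the input's spatial size before batch normalization
+ is applied (see forward below).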
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv2d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm2d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.stride == 2: + h, w = list(x.size())[2:] + y = y[:, :, :2 * h, :2 * w].contiguous() + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Conv3d(nn.Module): + """Applies a 3D convolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. + + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Conv3d, self).__init__() + self.out_channels = out_channels + self.kernel_size = kernel_size + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.Conv3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + x = self.conv(x) + if self.bn is not None: + x = self.bn(x) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class Deconv3d(nn.Module): + """Applies a 3D deconvolution (optionally with batch normalization and relu activation) + over an input signal composed of several input planes. 
+ + Attributes: + conv (nn.Module): convolution module + bn (nn.Module): batch normalization module + relu (bool): whether to activate by relu + + Notes: + Default momentum for batch normalization is set to be 0.01, + + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + relu=True, + bn=True, + bn_momentum=0.1, + init_method='xavier', + **kwargs): + super(Deconv3d, self).__init__() + self.out_channels = out_channels + assert stride in [1, 2] + self.stride = stride + + self.conv = nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size, + stride=stride, + bias=(not bn), + **kwargs) + self.bn = nn.BatchNorm3d( + out_channels, momentum=bn_momentum) if bn else None + self.relu = relu + + def forward(self, x): + y = self.conv(x) + if self.bn is not None: + x = self.bn(y) + if self.relu: + x = F.relu(x, inplace=True) + return x + + def init_weights(self, init_method): + """default initialization""" + init_uniform(self.conv, init_method) + if self.bn is not None: + init_bn(self.bn) + + +class ConvBnReLU(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBnReLU, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return F.relu(self.bn(self.conv(x)), inplace=True) + + +class ConvBn(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + pad=1): + super(ConvBn, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=pad, + bias=False) + self.bn = nn.BatchNorm2d(out_channels) + + def forward(self, x): + return self.bn(self.conv(x)) + + +def homo_warping(src_fea, src_proj, ref_proj, depth_values): + """ + src_fea: [B, C, H, W] + src_proj: [B, 4, 4] + ref_proj: [B, 4, 4] + depth_values: [B, Ndepth] o [B, Ndepth, H, W] + out: [B, C, Ndepth, H, W] + """ + batch, channels = src_fea.shape[0], src_fea.shape[1] + num_depth = depth_values.shape[1] + height, width = src_fea.shape[2], src_fea.shape[3] + + with torch.no_grad(): + proj = torch.matmul(src_proj, torch.inverse(ref_proj)) + rot = proj[:, :3, :3] # [B,3,3] + trans = proj[:, :3, 3:4] # [B,3,1] + + y, x = torch.meshgrid([ + torch.arange( + 0, height, dtype=torch.float32, device=src_fea.device), + torch.arange(0, width, dtype=torch.float32, device=src_fea.device) + ]) + y, x = y.contiguous(), x.contiguous() + y, x = y.view(height * width), x.view(height * width) + xyz = torch.stack((x, y, torch.ones_like(x))) # [3, H*W] + xyz = torch.unsqueeze(xyz, 0).repeat(batch, 1, 1) # [B, 3, H*W] + rot_xyz = torch.matmul(rot, xyz) # [B, 3, H*W] + rot_depth_xyz = rot_xyz.unsqueeze(2).repeat( + 1, 1, num_depth, 1) * depth_values.view(batch, 1, num_depth, + -1) # [B, 3, Ndepth, H*W] + proj_xyz = rot_depth_xyz + trans.view(batch, 3, 1, + 1) # [B, 3, Ndepth, H*W] + proj_xy = proj_xyz[:, : + 2, :, :] / proj_xyz[:, 2: + 3, :, :] # [B, 2, Ndepth, H*W] + proj_x_normalized = proj_xy[:, 0, :, :] / ((width - 1) / 2) - 1 + proj_y_normalized = proj_xy[:, 1, :, :] / ((height - 1) / 2) - 1 + proj_xy = torch.stack((proj_x_normalized, proj_y_normalized), + dim=3) # [B, Ndepth, H*W, 2] + grid = proj_xy + + warped_src_fea = F.grid_sample( + src_fea, + grid.view(batch, num_depth * height, width, 2), + mode='bilinear', + padding_mode='zeros', + align_corners=True) + warped_src_fea = warped_src_fea.view(batch, channels, num_depth, 
height, + width) + + return warped_src_fea + + +class DeConv2dFuse(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + relu=True, + bn=True, + bn_momentum=0.1): + super(DeConv2dFuse, self).__init__() + + self.deconv = Deconv2d( + in_channels, + out_channels, + kernel_size, + stride=2, + padding=1, + output_padding=1, + bn=True, + relu=relu, + bn_momentum=bn_momentum) + + self.conv = Conv2d( + 2 * out_channels, + out_channels, + kernel_size, + stride=1, + padding=1, + bn=bn, + relu=relu, + bn_momentum=bn_momentum) + + def forward(self, x_pre, x): + x = self.deconv(x) + x = torch.cat((x, x_pre), dim=1) + x = self.conv(x) + return x + + +class FeatureNet(nn.Module): + + def __init__(self, base_channels, num_stage=3, stride=4, arch_mode='unet'): + super(FeatureNet, self).__init__() + assert arch_mode in [ + 'unet', 'fpn' + ], f"mode must be in 'unet' or 'fpn', but get:{arch_mode}" + self.arch_mode = arch_mode + self.stride = stride + self.base_channels = base_channels + self.num_stage = num_stage + + self.conv0 = nn.Sequential( + Conv2d(3, base_channels, 3, 1, padding=1), + Conv2d(base_channels, base_channels, 3, 1, padding=1), + ) + + self.conv1 = nn.Sequential( + Conv2d(base_channels, base_channels * 2, 5, stride=2, padding=2), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + Conv2d(base_channels * 2, base_channels * 2, 3, 1, padding=1), + ) + + self.conv2 = nn.Sequential( + Conv2d( + base_channels * 2, base_channels * 4, 5, stride=2, padding=2), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + Conv2d(base_channels * 4, base_channels * 4, 3, 1, padding=1), + ) + + self.out1 = nn.Conv2d( + base_channels * 4, base_channels * 4, 1, bias=False) + self.out_channels = [4 * base_channels] + + if self.arch_mode == 'unet': + if num_stage == 3: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + self.deconv2 = DeConv2dFuse(base_channels * 2, base_channels, + 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out3 = nn.Conv2d( + base_channels, base_channels, 1, bias=False) + self.out_channels.append(2 * base_channels) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.deconv1 = DeConv2dFuse(base_channels * 4, + base_channels * 2, 3) + + self.out2 = nn.Conv2d( + base_channels * 2, base_channels * 2, 1, bias=False) + self.out_channels.append(2 * base_channels) + elif self.arch_mode == 'fpn': + final_chs = base_channels * 4 + if num_stage == 3: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + self.inner2 = nn.Conv2d( + base_channels * 1, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels * 2, 3, padding=1, bias=False) + self.out3 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels * 2) + self.out_channels.append(base_channels) + + elif num_stage == 2: + self.inner1 = nn.Conv2d( + base_channels * 2, final_chs, 1, bias=True) + + self.out2 = nn.Conv2d( + final_chs, base_channels, 3, padding=1, bias=False) + self.out_channels.append(base_channels) + + def forward(self, x): + conv0 = self.conv0(x) + conv1 = self.conv1(conv0) + conv2 = self.conv2(conv1) + + intra_feat = conv2 + outputs = {} + out = self.out1(intra_feat) + outputs['stage1'] = out + if self.arch_mode == 'unet': + if self.num_stage == 3: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = self.deconv2(conv0, intra_feat) + 
out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = self.deconv1(conv1, intra_feat) + out = self.out2(intra_feat) + outputs['stage2'] = out + + elif self.arch_mode == 'fpn': + if self.num_stage == 3: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner2(conv0) + out = self.out3(intra_feat) + outputs['stage3'] = out + + elif self.num_stage == 2: + intra_feat = F.interpolate( + intra_feat, scale_factor=2, + mode='nearest') + self.inner1(conv1) + out = self.out2(intra_feat) + outputs['stage2'] = out + + return outputs + + +class CostRegNet(nn.Module): + + def __init__(self, in_channels, base_channels): + super(CostRegNet, self).__init__() + self.conv0 = Conv3d(in_channels, base_channels, padding=1) + + self.conv1 = Conv3d( + base_channels, base_channels * 2, stride=2, padding=1) + self.conv2 = Conv3d(base_channels * 2, base_channels * 2, padding=1) + + self.conv3 = Conv3d( + base_channels * 2, base_channels * 4, stride=2, padding=1) + self.conv4 = Conv3d(base_channels * 4, base_channels * 4, padding=1) + + self.conv5 = Conv3d( + base_channels * 4, base_channels * 8, stride=2, padding=1) + self.conv6 = Conv3d(base_channels * 8, base_channels * 8, padding=1) + + self.conv7 = Deconv3d( + base_channels * 8, + base_channels * 4, + stride=2, + padding=1, + output_padding=1) + + self.conv9 = Deconv3d( + base_channels * 4, + base_channels * 2, + stride=2, + padding=1, + output_padding=1) + + self.conv11 = Deconv3d( + base_channels * 2, + base_channels * 1, + stride=2, + padding=1, + output_padding=1) + + self.prob = nn.Conv3d( + base_channels, 1, 3, stride=1, padding=1, bias=False) + + def forward(self, x): + conv0 = self.conv0(x) + conv2 = self.conv2(self.conv1(conv0)) + conv4 = self.conv4(self.conv3(conv2)) + x = self.conv6(self.conv5(conv4)) + x = conv4 + self.conv7(x) + x = conv2 + self.conv9(x) + x = conv0 + self.conv11(x) + x = self.prob(x) + return x + + +class RefineNet(nn.Module): + + def __init__(self): + super(RefineNet, self).__init__() + self.conv1 = ConvBnReLU(4, 32) + self.conv2 = ConvBnReLU(32, 32) + self.conv3 = ConvBnReLU(32, 32) + self.res = ConvBnReLU(32, 1) + + def forward(self, img, depth_init): + concat = F.cat((img, depth_init), dim=1) + depth_residual = self.res(self.conv3(self.conv2(self.conv1(concat)))) + depth_refined = depth_init + depth_residual + return depth_refined + + +def depth_regression(p, depth_values): + if depth_values.dim() <= 2: + depth_values = depth_values.view(*depth_values.shape, 1, 1) + depth = torch.sum(p * depth_values, 1) + + return depth + + +def cas_mvsnet_loss(inputs, depth_gt_ms, mask_ms, **kwargs): + depth_loss_weights = kwargs.get('dlossw', None) + + total_loss = torch.tensor( + 0.0, + dtype=torch.float32, + device=mask_ms['stage1'].device, + requires_grad=False) + + for (stage_inputs, stage_key) in [(inputs[k], k) for k in inputs.keys() + if 'stage' in k]: + depth_est = stage_inputs['depth'] + depth_gt = depth_gt_ms[stage_key] + mask = mask_ms[stage_key] + mask = mask > 0.5 + + depth_loss = F.smooth_l1_loss( + depth_est[mask], depth_gt[mask], reduction='mean') + + if depth_loss_weights is not None: + stage_idx = int(stage_key.replace('stage', '')) - 1 + total_loss += depth_loss_weights[stage_idx] * depth_loss + else: + total_loss += 1.0 * depth_loss + + return total_loss, depth_loss + + +def 
get_cur_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape, (B, H, W) + cur_depth: (B, H, W) + return depth_range_values: (B, D, H, W) + """ + cur_depth_min = (cur_depth - ndepth / 2 * depth_inteval_pixel) # (B, H, W) + cur_depth_max = (cur_depth + ndepth / 2 * depth_inteval_pixel) + + assert cur_depth.shape == torch.Size( + shape), 'cur_depth:{}, input shape:{}'.format(cur_depth.shape, shape) + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, H, W) + + depth_range_samples = cur_depth_min.unsqueeze(1) + ( + torch.arange( + 0, + ndepth, + device=cur_depth.device, + dtype=cur_depth.dtype, + requires_grad=False).reshape(1, -1, 1, 1) + * new_interval.unsqueeze(1)) + + return depth_range_samples + + +def get_depth_range_samples(cur_depth, + ndepth, + depth_inteval_pixel, + device, + dtype, + shape, + max_depth=192.0, + min_depth=0.0): + """ + shape: (B, H, W) + cur_depth: (B, H, W) or (B, D) + return depth_range_samples: (B, D, H, W) + """ + if cur_depth.dim() == 2: + cur_depth_min = cur_depth[:, 0] # (B,) + cur_depth_max = cur_depth[:, -1] + new_interval = (cur_depth_max - cur_depth_min) / (ndepth - 1) # (B, ) + + depth_range_samples = cur_depth_min.unsqueeze(1) + (torch.arange( + 0, ndepth, device=device, dtype=dtype, + requires_grad=False).reshape(1, -1) * new_interval.unsqueeze(1) + ) # noqa # (B, D) + + depth_range_samples = depth_range_samples.unsqueeze(-1).unsqueeze( + -1).repeat(1, 1, shape[1], shape[2]) # (B, D, H, W) + + else: + + depth_range_samples = get_cur_depth_range_samples( + cur_depth, ndepth, depth_inteval_pixel, shape, max_depth, + min_depth) + + return depth_range_samples diff --git a/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py new file mode 100644 index 000000000..aeab02b36 --- /dev/null +++ b/modelscope/models/cv/image_mvs_depth_estimation_geomvsnet/utils.py @@ -0,0 +1,118 @@ +# The implementation here is modified based on https://github.com/xy-guo/MVSNet_pytorch +import random + +import numpy as np +import torch +import torch.nn.functional as F +import torchvision.utils as vutils + + +# convert a function into recursive style to handle nested dict/list/tuple variables +def make_recursive_func(func): + + def wrapper(vars): + if isinstance(vars, list): + return [wrapper(x) for x in vars] + elif isinstance(vars, tuple): + return tuple([wrapper(x) for x in vars]) + elif isinstance(vars, dict): + return {k: wrapper(v) for k, v in vars.items()} + else: + return func(vars) + + return wrapper + + +@make_recursive_func +def tensor2numpy(vars): + if isinstance(vars, np.ndarray): + return vars + elif isinstance(vars, torch.Tensor): + return vars.detach().cpu().numpy().copy() + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +@make_recursive_func +def numpy2torch(vars): + if isinstance(vars, np.ndarray): + return torch.from_numpy(vars) + elif isinstance(vars, torch.Tensor): + return vars + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for numpy2torch'.format(type(vars))) + + +@make_recursive_func +def tocuda(vars): + if isinstance(vars, torch.Tensor): + return vars.to(torch.device('cuda')) + elif isinstance(vars, str): + return vars + else: + raise NotImplementedError( + 'invalid input type {} for tensor2numpy'.format(type(vars))) + + +def generate_pointcloud(rgb, depth, ply_file, intr, 
scale=1.0): + """ + Generate a colored point cloud in PLY format from a color and a depth image. + + Input: + rgb_file -- filename of color image + depth_file -- filename of depth image + ply_file -- filename of ply file + + """ + fx, fy, cx, cy = intr[0, 0], intr[1, 1], intr[0, 2], intr[1, 2] + points = [] + for v in range(rgb.shape[0]): + for u in range(rgb.shape[1]): + color = rgb[v, u] # rgb.getpixel((u, v)) + Z = depth[v, u] / scale + if Z == 0: + continue + X = (u - cx) * Z / fx + Y = (v - cy) * Z / fy + points.append('%f %f %f %d %d %d 0\n' % + (X, Y, Z, color[0], color[1], color[2])) + file = open(ply_file, 'w') + file.write('''ply + format ascii 1.0 + element vertex %d + property float x + property float y + property float z + property uchar red + property uchar green + property uchar blue + property uchar alpha + end_header + %s + ''' % (len(points), ''.join(points))) + file.close() + + +def write_cam(file, cam): + f = open(file, 'w') + f.write('extrinsic\n') + for i in range(0, 4): + for j in range(0, 4): + f.write(str(cam[0][i][j]) + ' ') + f.write('\n') + f.write('\n') + + f.write('intrinsic\n') + for i in range(0, 3): + for j in range(0, 3): + f.write(str(cam[1][i][j]) + ' ') + f.write('\n') + + f.write('\n' + str(cam[1][3][0]) + ' ' + str(cam[1][3][1]) + ' ' + + str(cam[1][3][2]) + ' ' + str(cam[1][3][3]) + '\n') + + f.close() diff --git a/modelscope/models/cv/image_normal_estimation/__init__.py b/modelscope/models/cv/image_normal_estimation/__init__.py new file mode 100644 index 000000000..9551a3842 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/__init__.py @@ -0,0 +1,22 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .omnidata_model import OmnidataNormalEstimation + +else: + _import_structure = { + 'omnidata_model': ['OmnidataNormalEstimation'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/image_normal_estimation/modules/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py b/modelscope/models/cv/image_normal_estimation/modules/midas/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py new file mode 100644 index 000000000..41564c78f --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/base_model.py @@ -0,0 +1,20 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch + + +class BaseModel(torch.nn.Module): + + def load(self, path): + """Load model from file. 
+ + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if 'optimizer' in parameters: + parameters = parameters['model'] + + self.load_state_dict(parameters) diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py new file mode 100644 index 000000000..e0a30733a --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/blocks.py @@ -0,0 +1,395 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn + +from .vit import (_make_pretrained_vitb16_384, _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, forward_vit) + + +def _make_encoder( + backbone, + features, + use_pretrained, + groups=1, + expand=False, + exportable=True, + hooks=None, + use_vit_only=False, + use_readout='ignore', +): + if backbone == 'vitl16_384': + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, + expand=expand) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb_rn50_384': + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, + expand=expand) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == 'vitb16_384': + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, + expand=expand) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == 'resnext101_wsl': + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], + features, + groups=groups, + expand=expand) # efficientnet_lite3 + elif backbone == 'efficientnet_lite3': + pretrained = _make_pretrained_efficientnet_lite3( + use_pretrained, exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], + features, + groups=groups, + expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + out_shape4 = out_shape + if expand is True: + out_shape1 = out_shape + out_shape2 = out_shape * 2 + out_shape3 = out_shape * 4 + out_shape4 = out_shape * 8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], + out_shape1, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], + out_shape2, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], + out_shape3, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + scratch.layer4_rn = nn.Conv2d( + in_shape[3], + out_shape4, + kernel_size=3, + stride=1, + padding=1, + bias=False, + groups=groups) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + 'rwightman/gen-efficientnet-pytorch', + 'tf_efficientnet_lite3', + pretrained=use_pretrained, + exportable=exportable) + return 
_make_efficientnet_backbone(efficientnet) + + +def _make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential(effnet.conv_stem, effnet.bn1, + effnet.act1, *effnet.blocks[0:2]) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + pretrained.layer1 = nn.Sequential(resnet.conv1, resnet.bn1, resnet.relu, + resnet.maxpool, resnet.layer1) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + pretrained.layer4 = resnet.layer4 + + return pretrained + + +def _make_pretrained_resnext101_wsl(use_pretrained): + resnet = torch.hub.load('facebookresearch/WSL-Images', + 'resnext101_32x8d_wsl') + return _make_resnet_backbone(resnet) + + +class Interpolate(nn.Module): + """Interpolation module. + """ + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode='bilinear', align_corners=True) + + return output + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. 
+ + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups = 1 + + self.conv1 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + self.conv2 = nn.Conv2d( + features, + features, + kernel_size=3, + stride=1, + padding=1, + bias=True, + groups=self.groups) + + if self.bn is True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn is True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn is True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block. + """ + + def __init__(self, + features, + activation, + deconv=False, + bn=False, + expand=False, + align_corners=True): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups = 1 + + self.expand = expand + out_features = features + if self.expand is True: + out_features = features // 2 + + self.out_conv = nn.Conv2d( + features, + out_features, + kernel_size=1, + stride=1, + padding=0, + bias=True, + groups=1) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, *xs): + """Forward pass. 
+ + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners) + + output = self.out_conv(output) + + return output diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py new file mode 100644 index 000000000..af7993278 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/dpt_depth.py @@ -0,0 +1,108 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .base_model import BaseModel +from .blocks import (FeatureFusionBlock, FeatureFusionBlock_custom, + Interpolate, _make_encoder, forward_vit) + + +def _make_fusion_block(features, use_bn): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + ) + + +class DPT(BaseModel): + + def __init__( + self, + head, + features=256, + backbone='vitb_rn50_384', + readout='project', + channels_last=False, + use_bn=False, + ): + + super(DPT, self).__init__() + + self.channels_last = channels_last + + hooks = { + 'vitb_rn50_384': [0, 1, 8, 11], + 'vitb16_384': [2, 5, 8, 11], + 'vitl16_384': [5, 11, 17, 23], + } + + # Instantiate backbone and reassemble blocks + self.pretrained, self.scratch = _make_encoder( + backbone, + features, + False, # Set to true of you want to train from scratch, uses ImageNet weights + groups=1, + expand=False, + exportable=False, + hooks=hooks[backbone], + use_readout=readout, + ) + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn) + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.scratch.output_conv = head + + def forward(self, x): + if self.channels_last is True: + x.contiguous(memory_format=torch.channels_last) + + layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return out + + +class DPTDepthModel(DPT): + + def __init__(self, path=None, non_negative=True, num_channels=1, **kwargs): + features = kwargs['features'] if 'features' in kwargs else 256 + + head = nn.Sequential( + nn.Conv2d( + features, features // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode='bilinear', align_corners=True), + nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, num_channels, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + + def forward(self, x): + return super().forward(x).squeeze(dim=1) 
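# Illustrative sketch (not part of the patch): how the DPTDepthModel defined
# above can be instantiated for 3-channel surface-normal prediction, as done
# by OmnidataNormalEstimation later in this diff. It assumes a timm version
# that still provides the 'vit_base_resnet50_384' hybrid backbone; with no
# checkpoint loaded the weights are randomly initialised, so the example only
# checks shapes.
import torch

from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \
    DPTDepthModel

model = DPTDepthModel(backbone='vitb_rn50_384', num_channels=3).eval()

x = torch.randn(1, 3, 384, 384)  # one RGB image at the 384x384 resolution used by the ViT-hybrid backbone
with torch.no_grad():
    normals = model(x)  # shape (1, 3, 384, 384); non-negative because the head ends in a ReLU

# Real weights would be loaded beforehand via model.load('<checkpoint path>'),
# the helper defined in base_model.py above.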
diff --git a/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py new file mode 100644 index 000000000..bb8ba9f31 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/modules/midas/vit.py @@ -0,0 +1,517 @@ +# This implementation is adopted from MiDaS +# made publicly available under the MIT license +# https://github.com/isl-org/MiDaS +import math +import types + +import timm +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Slice(nn.Module): + + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential( + nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +def forward_vit(pretrained, x): + b, c, h, w = x.shape + + _ = pretrained.model.forward_flex(x) + + layer_1 = pretrained.activations['1'] + layer_2 = pretrained.activations['2'] + layer_3 = pretrained.activations['3'] + layer_4 = pretrained.activations['4'] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size([ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ]), + )) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3:len(pretrained.act_postprocess1)]( + layer_1) + layer_2 = pretrained.act_postprocess2[3:len(pretrained.act_postprocess2)]( + layer_2) + layer_3 = pretrained.act_postprocess3[3:len(pretrained.act_postprocess3)]( + layer_3) + layer_4 = pretrained.act_postprocess4[3:len(pretrained.act_postprocess4)]( + layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, :self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, + -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate( + posemb_grid, size=(gs_h, gs_w), mode='bilinear') + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed 
= self._resize_pos_embed(self.pos_embed, h // self.patch_size[1], + w // self.patch_size[0]) + + B = x.shape[0] + + if hasattr(self.patch_embed, 'backbone'): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[ + -1] # last feature if backbone outputs list/tuple of features + + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, 'dist_token', None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + cls_tokens = self.cls_token.expand( + B, -1, -1) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +activations = {} + + +def get_activation(name): + + def hook(model, input, output): + activations[name] = output + + return hook + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == 'ignore': + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == 'add': + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == 'project': + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + 
in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitl16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_large_patch16_384', pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_vitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model('vit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_384(pretrained, use_readout='ignore', hooks=None): + model = timm.create_model( + 'vit_deit_base_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout) + + +def _make_pretrained_deitb16_distil_384(pretrained, + use_readout='ignore', + hooks=None): + model = timm.create_model( + 'vit_deit_base_distilled_patch16_384', pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + start_index=2, + ) + + +def _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=[0, 1, 8, 11], + vit_features=768, + use_vit_only=False, + use_readout='ignore', + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + + if use_vit_only: + pretrained.model.blocks[hooks[0]].register_forward_hook( + get_activation('1')) + pretrained.model.blocks[hooks[1]].register_forward_hook( + get_activation('2')) + else: + pretrained.model.patch_embed.backbone.stages[0].register_forward_hook( + get_activation('1')) + pretrained.model.patch_embed.backbone.stages[1].register_forward_hook( + get_activation('2')) + + pretrained.model.blocks[hooks[2]].register_forward_hook( + get_activation('3')) + pretrained.model.blocks[hooks[3]].register_forward_hook( + get_activation('4')) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, + start_index) + + if use_vit_only: + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + 
Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + else: + pretrained.act_postprocess1 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + pretrained.act_postprocess2 = nn.Sequential(nn.Identity(), + nn.Identity(), + nn.Identity()) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, + pretrained.model) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model) + + return pretrained + + +def _make_pretrained_vitb_rn50_384(pretrained, + use_readout='ignore', + hooks=None, + use_vit_only=False): + model = timm.create_model('vit_base_resnet50_384', pretrained=pretrained) + + hooks = [0, 1, 8, 11] if hooks is None else hooks + return _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) diff --git a/modelscope/models/cv/image_normal_estimation/omnidata_model.py b/modelscope/models/cv/image_normal_estimation/omnidata_model.py new file mode 100644 index 000000000..35e89c1c8 --- /dev/null +++ b/modelscope/models/cv/image_normal_estimation/omnidata_model.py @@ -0,0 +1,54 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
+# Model: Omnidata: A Scalable Pipeline for Making Multi-Task Mid-Level Vision Datasets from 3D Scans +# Paper link: https://arxiv.org/pdf/2110.04994.pdf +import os.path as osp + +import torch + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.image_normal_estimation.modules.midas.dpt_depth import \ + DPTDepthModel +from modelscope.outputs import OutputKeys +from modelscope.utils.constant import ModelFile, Tasks + + +@MODELS.register_module( + Tasks.image_normal_estimation, + module_name=Models.omnidata_normal_estimation) +class OmnidataNormalEstimation(TorchModel): + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + # build model + self.model = DPTDepthModel( + backbone='vitb_rn50_384', num_channels=3) # DPT Hybrid + # checkpoint = torch.load(pretrained_weights_path, map_location=map_location) + + # load model + model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_FILE) + checkpoint = torch.load(model_path, map_location='cpu') + if 'state_dict' in checkpoint: + state_dict = {} + for k, v in checkpoint['state_dict'].items(): + state_dict[k[6:]] = v + else: + state_dict = checkpoint + self.model.load_state_dict(state_dict) + self.model.eval() + + def forward(self, inputs): + return self.model(inputs['imgs']).clamp(min=0, max=1) + + def postprocess(self, inputs): + normal_result = inputs.flip(1) + results = {OutputKeys.NORMALS: normal_result} + return results + + def inference(self, data): + results = self.forward(data) + + return results diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py index 24451e96c..8bbc80589 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/net.py @@ -1,4 +1,4 @@ -# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/net.py import time diff --git a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py index 64d959713..8f39db786 100755 --- a/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py +++ b/modelscope/models/cv/image_portrait_enhancement/retinaface/models/retinaface.py @@ -1,4 +1,4 @@ -# The implementation is adopted from Pytorch_Retinaface, made pubicly available under the MIT License +# The implementation is adopted from Pytorch_Retinaface, made publicly available under the MIT License # at https://github.com/biubug6/Pytorch_Retinaface/tree/master/models/retinaface.py from collections import OrderedDict diff --git a/modelscope/models/cv/image_probing_model/backbone.py b/modelscope/models/cv/image_probing_model/backbone.py index 8f3ed5b6f..64fb37b3c 100644 --- a/modelscope/models/cv/image_probing_model/backbone.py +++ b/modelscope/models/cv/image_probing_model/backbone.py @@ -1,5 +1,5 @@ # The implementation is adopted from OpenAI-CLIP, -# made pubicly available under the MIT License at https://github.com/openai/CLIP +# made publicly available under the MIT License at 
https://github.com/openai/CLIP import math import sys diff --git a/modelscope/models/cv/image_quality_assessment_man/maniqa.py b/modelscope/models/cv/image_quality_assessment_man/maniqa.py index 8c9243096..eb037941b 100644 --- a/modelscope/models/cv/image_quality_assessment_man/maniqa.py +++ b/modelscope/models/cv/image_quality_assessment_man/maniqa.py @@ -1,4 +1,4 @@ -# This implementation is adopted from MANIQA, made pubicly available under the Apache License 2.0 at +# This implementation is adopted from MANIQA, made publicly available under the Apache License 2.0 at # https://github.com/IIGROUP/MANIQA/blob/master/models/maniqa.py import timm diff --git a/modelscope/models/cv/image_quality_assessment_man/swin.py b/modelscope/models/cv/image_quality_assessment_man/swin.py index df58277f2..e77488c04 100644 --- a/modelscope/models/cv/image_quality_assessment_man/swin.py +++ b/modelscope/models/cv/image_quality_assessment_man/swin.py @@ -1,4 +1,4 @@ -# This implementation is adopted form SwinTransformer, made pubicly available under the MIT License at +# This implementation is adopted form SwinTransformer, made publicly available under the MIT License at # https://github.com/microsoft/Swin-Transformer/blob/main/models/swin_transformer.py import collections.abc diff --git a/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py index e153e5f96..9282005ec 100644 --- a/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py +++ b/modelscope/models/cv/image_quality_assessment_mos/backbones/resnet.py @@ -1,4 +1,4 @@ -# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at +# The implementation is adopted from CenseoQoE, made publicly available under the MIT License at # https://github.com/Tencent/CenseoQoE import os diff --git a/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py index fbe40e6ae..f5710bc5a 100644 --- a/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py +++ b/modelscope/models/cv/image_quality_assessment_mos/censeo_ivqa_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from CenseoQoE, made pubicly available under the MIT License at +# The implementation is adopted from CenseoQoE, made publicly available under the MIT License at # https://github.com/Tencent/CenseoQoE import torch diff --git a/modelscope/models/cv/image_reid_person/pass_model.py b/modelscope/models/cv/image_reid_person/pass_model.py index 3b032949d..87c43340d 100644 --- a/modelscope/models/cv/image_reid_person/pass_model.py +++ b/modelscope/models/cv/image_reid_person/pass_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at +# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 License at # https://github.com/CASIA-IVA-Lab/PASS-reID import os diff --git a/modelscope/models/cv/image_reid_person/transreid_model.py b/modelscope/models/cv/image_reid_person/transreid_model.py index 5bceb4685..924c58973 100644 --- a/modelscope/models/cv/image_reid_person/transreid_model.py +++ b/modelscope/models/cv/image_reid_person/transreid_model.py @@ -1,4 +1,4 @@ -# The implementation is adopted from PASS-reID, made pubicly available under the Apache-2.0 License at +# The implementation is adopted from PASS-reID, made publicly available under the Apache-2.0 
License at # https://github.com/CASIA-IVA-Lab/PASS-reID import collections.abc as container_abcs diff --git a/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py b/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py index 33de31e6f..414eae89f 100644 --- a/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py +++ b/modelscope/models/cv/image_super_resolution_pasd_v2/unet_2d_blocks.py @@ -17,11 +17,11 @@ import torch import torch.nn.functional as F from diffusers.models.activations import get_activation -from diffusers.models.attention import AdaGroupNorm from diffusers.models.attention_processor import (Attention, AttnAddedKVProcessor, AttnAddedKVProcessor2_0) from diffusers.models.dual_transformer_2d import DualTransformer2DModel +from diffusers.models.normalization import AdaLayerNorm from diffusers.models.resnet import (Downsample2D, FirDownsample2D, FirUpsample2D, KDownsample2D, KUpsample2D, ResnetBlock2D, Upsample2D) diff --git a/modelscope/models/cv/image_to_3d/__init__.py b/modelscope/models/cv/image_to_3d/__init__.py new file mode 100644 index 000000000..44c424281 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/__init__.py @@ -0,0 +1,2 @@ +# Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. +from . import ldm diff --git a/modelscope/models/cv/image_to_3d/ldm/base_utils.py b/modelscope/models/cv/image_to_3d/ldm/base_utils.py new file mode 100644 index 000000000..3362fa18f --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/base_utils.py @@ -0,0 +1,211 @@ +import pickle + +import cv2 +import numpy as np +from skimage.io import imread + + +def save_pickle(data, pkl_path): + # os.system('mkdir -p {}'.format(os.path.dirname(pkl_path))) + with open(pkl_path, 'wb') as f: + pickle.dump(data, f) + + +def read_pickle(pkl_path): + with open(pkl_path, 'rb') as f: + return pickle.load(f) + + +def draw_epipolar_line(F, img0, img1, pt0, color): + h1, w1 = img1.shape[:2] + hpt = np.asarray([pt0[0], pt0[1], 1], dtype=np.float32)[:, None] + ln = F @ hpt + ln = ln[:, 0] + a, b, c = ln[0], ln[1], ln[2] + pt1 = np.asarray([0, -c / b]).astype(np.int32) + pt2 = np.asarray([w1, (-a * w1 - c) / b]).astype(np.int32) + + img0 = cv2.circle(img0, tuple(pt0.astype(np.int32)), 5, color, 2) + img1 = cv2.line(img1, tuple(pt1), tuple(pt2), color, 2) + return img0, img1 + + +def draw_epipolar_lines(F, img0, img1, num=20): + img0, img1 = img0.copy(), img1.copy() + h0, w0, _ = img0.shape + h1, w1, _ = img1.shape + + for k in range(num): + color = np.random.randint(0, 255, [3], dtype=np.int32) + color = [int(c) for c in color] + pt = np.random.uniform(0, 1, 2) + pt[0] *= w0 + pt[1] *= h0 + pt = pt.astype(np.int32) + img0, img1 = draw_epipolar_line(F, img0, img1, pt, color) + + return img0, img1 + + +def compute_F(K1, K2, Rt0, Rt1=None): + if Rt1 is None: + R, t = Rt0[:, :3], Rt0[:, 3:] + else: + Rt = compute_dR_dt(Rt0, Rt1) + R, t = Rt[:, :3], Rt[:, 3:] + A = K1 @ R.T @ t # [3,1] + C = np.asarray([[0, -A[2, 0], A[1, 0]], [A[2, 0], 0, -A[0, 0]], + [-A[1, 0], A[0, 0], 0]]) + F = (np.linalg.inv(K2)).T @ R @ K1.T @ C + return F + + +def compute_dR_dt(Rt0, Rt1): + R0, t0 = Rt0[:, :3], Rt0[:, 3:] + R1, t1 = Rt1[:, :3], Rt1[:, 3:] + dR = np.dot(R1, R0.T) + dt = t1 - np.dot(dR, t0) + return np.concatenate([dR, dt], -1) + + +def concat_images(img0, img1, vert=False): + if not vert: + h0, h1 = img0.shape[0], img1.shape[0], + if h0 < h1: + img0 = cv2.copyMakeBorder( + img0, + 0, + h1 - h0, + 0, + 0, + borderType=cv2.BORDER_CONSTANT, + 
value=0) + if h1 < h0: + img1 = cv2.copyMakeBorder( + img1, + 0, + h0 - h1, + 0, + 0, + borderType=cv2.BORDER_CONSTANT, + value=0) + img = np.concatenate([img0, img1], axis=1) + else: + w0, w1 = img0.shape[1], img1.shape[1] + if w0 < w1: + img0 = cv2.copyMakeBorder( + img0, + 0, + 0, + 0, + w1 - w0, + borderType=cv2.BORDER_CONSTANT, + value=0) + if w1 < w0: + img1 = cv2.copyMakeBorder( + img1, + 0, + 0, + 0, + w0 - w1, + borderType=cv2.BORDER_CONSTANT, + value=0) + img = np.concatenate([img0, img1], axis=0) + + return img + + +def concat_images_list(*args, vert=False): + if len(args) == 1: + return args[0] + img_out = args[0] + for img in args[1:]: + img_out = concat_images(img_out, img, vert) + return img_out + + +def pose_inverse(pose): + R = pose[:, :3].T + t = -R @ pose[:, 3:] + return np.concatenate([R, t], -1) + + +def project_points(pts, RT, K): + pts = np.matmul(pts, RT[:, :3].transpose()) + RT[:, 3:].transpose() + pts = np.matmul(pts, K.transpose()) + dpt = pts[:, 2] + mask0 = (np.abs(dpt) < 1e-4) & (np.abs(dpt) > 0) + if np.sum(mask0) > 0: + dpt[mask0] = 1e-4 + mask1 = (np.abs(dpt) > -1e-4) & (np.abs(dpt) < 0) + if np.sum(mask1) > 0: + dpt[mask1] = -1e-4 + pts2d = pts[:, :2] / dpt[:, None] + return pts2d, dpt + + +def draw_keypoints(img, kps, colors=None, radius=2): + out_img = img.copy() + for pi, pt in enumerate(kps): + pt = np.round(pt).astype(np.int32) + if colors is not None: + color = [int(c) for c in colors[pi]] + cv2.circle(out_img, tuple(pt), radius, color, -1) + else: + cv2.circle(out_img, tuple(pt), radius, (0, 255, 0), -1) + return out_img + + +def output_points(fn, pts, colors=None): + with open(fn, 'w') as f: + for pi, pt in enumerate(pts): + f.write(f'{pt[0]:.6f} {pt[1]:.6f} {pt[2]:.6f} ') + if colors is not None: + f.write( + f'{int(colors[pi,0])} {int(colors[pi,1])} {int(colors[pi,2])}' + ) + f.write('\n') + + +DEPTH_MAX, DEPTH_MIN = 2.4, 0.6 +DEPTH_VALID_MAX, DEPTH_VALID_MIN = 2.37, 0.63 + + +def read_depth_objaverse(depth_fn): + depth = imread(depth_fn) + depth = depth.astype( + np.float32) / 65535 * (DEPTH_MAX - DEPTH_MIN) + DEPTH_MIN + mask = (depth > DEPTH_VALID_MIN) & (depth < DEPTH_VALID_MAX) + return depth, mask + + +def mask_depth_to_pts(mask, depth, K, rgb=None): + hs, ws = np.nonzero(mask) + depth = depth[hs, ws] + pts = np.asarray([ws, hs, depth], np.float32).transpose() + pts[:, :2] *= pts[:, 2:] + if rgb is not None: + return np.dot(pts, np.linalg.inv(K).transpose()), rgb[hs, ws] + else: + return np.dot(pts, np.linalg.inv(K).transpose()) + + +def transform_points_pose(pts, pose): + R, t = pose[:, :3], pose[:, 3] + if len(pts.shape) == 1: + return (R @ pts[:, None] + t[:, None])[:, 0] + return pts @ R.T + t[None, :] + + +def pose_apply(pose, pts): + return transform_points_pose(pts, pose) + + +def downsample_gaussian_blur(img, ratio): + sigma = (1 / ratio) / 3 + # ksize=np.ceil(2*sigma) + ksize = int(np.ceil(((sigma - 0.8) / 0.3 + 1) * 2 + 1)) + ksize = ksize + 1 if ksize % 2 == 0 else ksize + img = cv2.GaussianBlur( + img, (ksize, ksize), sigma, borderType=cv2.BORDER_REFLECT101) + return img diff --git a/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py b/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py new file mode 100644 index 000000000..6d5a538e1 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/autoencoder.py @@ -0,0 +1,558 @@ +from contextlib import contextmanager + +import pytorch_lightning as pl +import torch +import torch.nn.functional as F +from taming.modules.vqvae.quantize import VectorQuantizer2 as 
VectorQuantizer + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.model import ( + Decoder, Encoder) +from modelscope.models.cv.image_to_3d.ldm.modules.distributions.distributions import \ + DiagonalGaussianDistribution +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +class VQModel(pl.LightningModule): + + def __init__( + self, + ddconfig, + lossconfig, + n_embed, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + batch_resize_range=None, + scheduler_config=None, + lr_g_factor=1.0, + remap=None, + sane_index_shape=False, # tell vector quantizer to return indices as bhw + use_ema=False): + super().__init__() + self.embed_dim = embed_dim + self.n_embed = n_embed + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + self.quantize = VectorQuantizer( + n_embed, + embed_dim, + beta=0.25, + remap=remap, + sane_index_shape=sane_index_shape) + self.quant_conv = torch.nn.Conv2d(ddconfig['z_channels'], embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + self.batch_resize_range = batch_resize_range + if self.batch_resize_range is not None: + print( + f'{self.__class__.__name__}: Using per-batch resizing in range {batch_resize_range}.' + ) + + self.use_ema = use_ema + if self.use_ema: + self.model_ema = LitEma(self) + print(f'Keeping EMAs of {len(list(self.model_ema.buffers()))}.') + + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + self.scheduler_config = scheduler_config + self.lr_g_factor = lr_g_factor + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print(f'{context}: Switched to EMA weights') + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print(f'{context}: Restored training weights') + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + missing, unexpected = self.load_state_dict(sd, strict=False) + print( + f'Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys' + ) + if len(missing) > 0: + print(f'Missing Keys: {missing}') + print(f'Unexpected Keys: {unexpected}') + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self) + + def encode(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + quant, emb_loss, info = self.quantize(h) + return quant, emb_loss, info + + def encode_to_prequant(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + return h + + def decode(self, quant): + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + return dec + + def decode_code(self, code_b): + quant_b = self.quantize.embed_code(code_b) + dec = self.decode(quant_b) + return dec + + def forward(self, input, return_pred_indices=False): + quant, diff, (_, _, ind) = self.encode(input) + dec = self.decode(quant) + if 
return_pred_indices: + return dec, diff, ind + return dec, diff + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + if self.batch_resize_range is not None: + lower_size = self.batch_resize_range[0] + upper_size = self.batch_resize_range[1] + if self.global_step <= 4: + # do the first few batches with max size to avoid later oom + new_resize = upper_size + else: + new_resize = np.random.choice( + np.arange(lower_size, upper_size + 16, 16)) + if new_resize != x.shape[2]: + x = F.interpolate(x, size=new_resize, mode='bicubic') + x = x.detach() + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + # https://github.com/pytorch/pytorch/issues/37142 + # try not to fool the heuristics + x = self.get_input(batch, self.image_key) + xrec, qloss, ind = self(x, return_pred_indices=True) + + if optimizer_idx == 0: + # autoencode + aeloss, log_dict_ae = self.loss( + qloss, + x, + xrec, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train', + predicted_indices=ind) + + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=True) + return aeloss + + if optimizer_idx == 1: + # discriminator + discloss, log_dict_disc = self.loss( + qloss, + x, + xrec, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=True) + return discloss + + def validation_step(self, batch, batch_idx): + log_dict = self._validation_step(batch, batch_idx) + # with self.ema_scope(): + # log_dict_ema = self._validation_step( + # batch, batch_idx, suffix='_ema') + return log_dict + + def _validation_step(self, batch, batch_idx, suffix=''): + x = self.get_input(batch, self.image_key) + xrec, qloss, ind = self(x, return_pred_indices=True) + aeloss, log_dict_ae = self.loss( + qloss, + x, + xrec, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + suffix, + predicted_indices=ind) + + discloss, log_dict_disc = self.loss( + qloss, + x, + xrec, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val' + suffix, + predicted_indices=ind) + rec_loss = log_dict_ae[f'val{suffix}/rec_loss'] + self.log( + f'val{suffix}/rec_loss', + rec_loss, + prog_bar=True, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True) + self.log( + f'val{suffix}/aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=False, + on_epoch=True, + sync_dist=True) + if version.parse(pl.__version__) >= version.parse('1.4.0'): + del log_dict_ae[f'val{suffix}/rec_loss'] + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr_d = self.learning_rate + lr_g = self.lr_g_factor * self.learning_rate + print('lr_d', lr_d) + print('lr_g', lr_g) + opt_ae = torch.optim.Adam( + list(self.encoder.parameters()) + list(self.decoder.parameters()) + + list(self.quantize.parameters()) + + list(self.quant_conv.parameters()) + + list(self.post_quant_conv.parameters()), + lr=lr_g, + betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr_d, betas=(0.5, 0.9)) + + if self.scheduler_config is not None: + scheduler = instantiate_from_config(self.scheduler_config) + + print('Setting up LambdaLR scheduler...') + scheduler = [ + { + 'scheduler': + LambdaLR(opt_ae, lr_lambda=scheduler.schedule), + 'interval': 
'step', + 'frequency': 1 + }, + { + 'scheduler': + LambdaLR(opt_disc, lr_lambda=scheduler.schedule), + 'interval': 'step', + 'frequency': 1 + }, + ] + return [opt_ae, opt_disc], scheduler + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + def log_images(self, batch, only_inputs=False, plot_ema=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if only_inputs: + log['inputs'] = x + return log + xrec, _ = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['inputs'] = x + log['reconstructions'] = xrec + if plot_ema: + with self.ema_scope(): + xrec_ema, _ = self(x) + if x.shape[1] > 3: + xrec_ema = self.to_rgb(xrec_ema) + log['reconstructions_ema'] = xrec_ema + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. + return x + + +class VQModelInterface(VQModel): + + def __init__(self, embed_dim, *args, **kwargs): + super().__init__(embed_dim=embed_dim, *args, **kwargs) + self.embed_dim = embed_dim + + def encode(self, x): + h = self.encoder(x) + h = self.quant_conv(h) + return h + + def decode(self, h, force_not_quantize=False): + # also go through quantization layer + if not force_not_quantize: + quant, emb_loss, info = self.quantize(h) + else: + quant = h + quant = self.post_quant_conv(quant) + dec = self.decoder(quant) + return dec + + +class AutoencoderKL(pl.LightningModule): + + def __init__( + self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key='image', + colorize_nlabels=None, + monitor=None, + ): + super().__init__() + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig['double_z'] + self.quant_conv = torch.nn.Conv2d(2 * ddconfig['z_channels'], + 2 * embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, + ddconfig['z_channels'], 1) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels) == int + self.register_buffer('colorize', + torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location='cpu')['state_dict'] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print('Deleting key {} from state_dict.'.format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + print(f'Restored from {path}') + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, + 2).to(memory_format=torch.contiguous_format).float() + 
return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + self.log( + 'aeloss', + aeloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_ae, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + optimizer_idx, + self.global_step, + last_layer=self.get_last_layer(), + split='train') + + self.log( + 'discloss', + discloss, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True) + self.log_dict( + log_dict_disc, + prog_bar=False, + logger=True, + on_step=True, + on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss( + inputs, + reconstructions, + posterior, + 0, + self.global_step, + last_layer=self.get_last_layer(), + split='val') + + discloss, log_dict_disc = self.loss( + inputs, + reconstructions, + posterior, + 1, + self.global_step, + last_layer=self.get_last_layer(), + split='val') + + self.log('val/rec_loss', log_dict_ae['val/rec_loss']) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + opt_ae = torch.optim.Adam( + list(self.encoder.parameters()) + list(self.decoder.parameters()) + + list(self.quant_conv.parameters()) + + list(self.post_quant_conv.parameters()), + lr=lr, + betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam( + self.loss.discriminator.parameters(), lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log['samples'] = self.decode(torch.randn_like(posterior.sample())) + log['reconstructions'] = xrec + log['inputs'] = x + return log + + def to_rgb(self, x): + assert self.image_key == 'segmentation' + if not hasattr(self, 'colorize'): + self.register_buffer('colorize', + torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2. * (x - x.min()) / (x.max() - x.min()) - 1. 
+ return x + + +class IdentityFirstStage(torch.nn.Module): + + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface # TODO: Should be true by default but check to not break older stuff + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/__init__.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py new file mode 100644 index 000000000..9783ee5b3 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer.py @@ -0,0 +1,973 @@ +from pathlib import Path + +import numpy as np +import pytorch_lightning as pl +import torch +import torch.nn as nn +import torch.nn.functional as F +from skimage.io import imsave +from torch.optim.lr_scheduler import LambdaLR +from tqdm import tqdm + +from modelscope.models.cv.image_to_3d.ldm.base_utils import ( + concat_images_list, read_pickle) +from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer_network import ( + FrustumTV3DNet, NoisyTargetViewEncoder, SpatialTime3DNet) +from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer_utils import ( + create_target_volume, get_warp_coordinates) +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + make_ddim_timesteps, timestep_embedding) +from modelscope.models.cv.image_to_3d.ldm.modules.encoders.modules import \ + FrozenCLIPImageEmbedder +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +def disable_training_module(module: nn.Module): + module = module.eval() + module.train = disabled_train + for para in module.parameters(): + para.requires_grad = False + return module + + +def repeat_to_batch(tensor, B, VN): + t_shape = tensor.shape + ones = [1 for _ in range(len(t_shape) - 1)] + tensor_new = tensor.view(B, 1, *t_shape[1:]).repeat(1, VN, *ones).view( + B * VN, *t_shape[1:]) + return tensor_new + + +class UNetWrapper(nn.Module): + + def __init__(self, + diff_model_config, + drop_conditions=False, + drop_scheme='default', + use_zero_123=True): + super().__init__() + self.diffusion_model = instantiate_from_config(diff_model_config) + self.drop_conditions = drop_conditions + self.drop_scheme = drop_scheme + self.use_zero_123 = use_zero_123 + + def drop(self, cond, mask): + shape = cond.shape + B = shape[0] + cond = mask.view(B, *[1 for _ in range(len(shape) - 1)]) * cond + return cond + + def get_trainable_parameters(self): + return self.diffusion_model.get_trainable_parameters() + + def get_drop_scheme(self, B, device): + if self.drop_scheme == 'default': + random = torch.rand(B, dtype=torch.float32, device=device) + drop_clip = (random > 0.15) & (random <= 0.2) + drop_volume = (random > 0.1) & (random <= 0.15) + drop_concat = (random > 0.05) & (random <= 0.1) + drop_all = random <= 0.05 + else: + raise NotImplementedError + return drop_clip, drop_volume, drop_concat, drop_all + + 
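+    # Note on the 'default' drop scheme above: one uniform sample per batch
+    # element picks, with probability 0.05 each, whether to drop only the
+    # CLIP embedding, only the volume features, only the concatenated input
+    # latent, or all three conditions at once (otherwise nothing is dropped).
+    # forward() applies these masks when is_train=True, so that
+    # predict_with_unconditional_scale below can run classifier-free guidance
+    # against zeroed conditions at sampling time.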
def forward(self, + x, + t, + clip_embed, + volume_feats, + x_concat, + is_train=False): + """ + + @param x: B,4,H,W + @param t: B, + @param clip_embed: B,M,768 + @param volume_feats: B,C,D,H,W + @param x_concat: B,C,H,W + @param is_train: + @return: + """ + if self.drop_conditions and is_train: + B = x.shape[0] + drop_clip, drop_volume, drop_concat, drop_all = self.get_drop_scheme( + B, x.device) + + clip_mask = 1.0 - (drop_clip | drop_all).float() + clip_embed = self.drop(clip_embed, clip_mask) + + volume_mask = 1.0 - (drop_volume | drop_all).float() + for k, v in volume_feats.items(): + volume_feats[k] = self.drop(v, mask=volume_mask) + + concat_mask = 1.0 - (drop_concat | drop_all).float() + x_concat = self.drop(x_concat, concat_mask) + + if self.use_zero_123: + # zero123 does not multiply this when encoding, maybe a bug for zero123 + first_stage_scale_factor = 0.18215 + x_concat_ = x_concat * 1.0 + x_concat_[:, :4] = x_concat_[:, :4] / first_stage_scale_factor + else: + x_concat_ = x_concat + + x = torch.cat([x, x_concat_], 1) + pred = self.diffusion_model(x, t, clip_embed, source_dict=volume_feats) + return pred + + def predict_with_unconditional_scale(self, x, t, clip_embed, volume_feats, + x_concat, unconditional_scale): + x_ = torch.cat([x] * 2, 0) + t_ = torch.cat([t] * 2, 0) + clip_embed_ = torch.cat([clip_embed, torch.zeros_like(clip_embed)], 0) + + v_ = {} + for k, v in volume_feats.items(): + v_[k] = torch.cat([v, torch.zeros_like(v)], 0) + + x_concat_ = torch.cat([x_concat, torch.zeros_like(x_concat)], 0) + if self.use_zero_123: + # zero123 does not multiply this when encoding, maybe a bug for zero123 + first_stage_scale_factor = 0.18215 + x_concat_[:, :4] = x_concat_[:, :4] / first_stage_scale_factor + x_ = torch.cat([x_, x_concat_], 1) + s, s_uc = self.diffusion_model( + x_, t_, clip_embed_, source_dict=v_).chunk(2) + s = s_uc + unconditional_scale * (s - s_uc) + return s + + +class SpatialVolumeNet(nn.Module): + + def __init__( + self, + time_dim, + view_dim, + view_num, + input_image_size=256, + frustum_volume_depth=48, + spatial_volume_size=32, + spatial_volume_length=0.5, + frustum_volume_length=0.86603 # sqrt(3)/2 + ): + super().__init__() + self.target_encoder = NoisyTargetViewEncoder( + time_dim, view_dim, output_dim=16) + self.spatial_volume_feats = SpatialTime3DNet( + input_dim=16 * view_num, + time_dim=time_dim, + dims=(64, 128, 256, 512)) + self.frustum_volume_feats = FrustumTV3DNet( + 64, time_dim, view_dim, dims=(64, 128, 256, 512)) + + self.frustum_volume_length = frustum_volume_length + self.input_image_size = input_image_size + self.spatial_volume_size = spatial_volume_size + self.spatial_volume_length = spatial_volume_length + + self.frustum_volume_size = self.input_image_size // 8 + self.frustum_volume_depth = frustum_volume_depth + self.time_dim = time_dim + self.view_dim = view_dim + # our rendered images are 1.5 away from the origin, we assume camera is 1.5 away from the origin + self.default_origin_depth = 1.5 + + def construct_spatial_volume(self, x, t_embed, v_embed, target_poses, + target_Ks): + """ + @param x: B,N,4,H,W + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param target_poses: N,3,4 + @param target_Ks: N,3,3 + @return: + """ + B, N, _, H, W = x.shape + V = self.spatial_volume_size + device = x.device + + spatial_volume_verts = torch.linspace( + -self.spatial_volume_length, + self.spatial_volume_length, + V, + dtype=torch.float32, + device=device) + spatial_volume_verts = torch.stack( + torch.meshgrid(spatial_volume_verts, 
spatial_volume_verts, + spatial_volume_verts), -1) + spatial_volume_verts = spatial_volume_verts.reshape(1, V**3, + 3)[:, :, (2, 1, 0)] + spatial_volume_verts = spatial_volume_verts.view( + 1, V, V, V, 3).permute(0, 4, 1, 2, 3).repeat(B, 1, 1, 1, 1) + + # encode source features + t_embed_ = t_embed.view(B, 1, self.time_dim).repeat(1, N, 1).view( + B, N, self.time_dim) + # v_embed_ = v_embed.view(1, N, self.view_dim).repeat(B, 1, 1).view(B, N, self.view_dim) + v_embed_ = v_embed + target_Ks = target_Ks.unsqueeze(0).repeat(B, 1, 1, 1) + target_poses = target_poses.unsqueeze(0).repeat(B, 1, 1, 1) + + # extract 2D image features + spatial_volume_feats = [] + # project source features + for ni in range(0, N): + pose_source_ = target_poses[:, ni] + K_source_ = target_Ks[:, ni] + x_ = self.target_encoder(x[:, ni], t_embed_[:, ni], v_embed_[:, + ni]) + C = x_.shape[1] + + coords_source = get_warp_coordinates( + spatial_volume_verts, x_.shape[-1], self.input_image_size, + K_source_, pose_source_).view(B, V, V * V, 2) + unproj_feats_ = F.grid_sample( + x_, + coords_source, + mode='bilinear', + padding_mode='zeros', + align_corners=True) + unproj_feats_ = unproj_feats_.view(B, C, V, V, V) + spatial_volume_feats.append(unproj_feats_) + + spatial_volume_feats = torch.stack(spatial_volume_feats, + 1) # B,N,C,V,V,V + N = spatial_volume_feats.shape[1] + spatial_volume_feats = spatial_volume_feats.view(B, N * C, V, V, V) + + spatial_volume_feats = self.spatial_volume_feats( + spatial_volume_feats, t_embed) # b,64,32,32,32 + return spatial_volume_feats + + def construct_view_frustum_volume(self, spatial_volume, t_embed, v_embed, + poses, Ks, target_indices): + """ + @param spatial_volume: B,C,V,V,V + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param poses: N,3,4 + @param Ks: N,3,3 + @param target_indices: B,TN + @return: B*TN,C,H,W + """ + B, TN = target_indices.shape + H, W = self.frustum_volume_size, self.frustum_volume_size + D = self.frustum_volume_depth + V = self.spatial_volume_size + + near = torch.ones( + B * TN, + 1, + H, + W, + dtype=spatial_volume.dtype, + device=spatial_volume.device + ) * self.default_origin_depth - self.frustum_volume_length + far = torch.ones( + B * TN, + 1, + H, + W, + dtype=spatial_volume.dtype, + device=spatial_volume.device + ) * self.default_origin_depth + self.frustum_volume_length + + target_indices = target_indices.view(B * TN) # B*TN + poses_ = poses[target_indices] # B*TN,3,4 + Ks_ = Ks[target_indices] # B*TN,3,4 + volume_xyz, volume_depth = create_target_volume( + D, self.frustum_volume_size, self.input_image_size, poses_, Ks_, + near, far) # B*TN,3 or 1,D,H,W + + # since the spatial volume is constructed in [-spatial_volume_length,spatial_volume_length] + volume_xyz_ = volume_xyz / self.spatial_volume_length + volume_xyz_ = volume_xyz_.permute(0, 2, 3, 4, 1) # B*TN,D,H,W,3 + spatial_volume_ = spatial_volume.unsqueeze(1).repeat( + 1, TN, 1, 1, 1, 1).view(B * TN, -1, V, V, V) + volume_feats = F.grid_sample( + spatial_volume_, + volume_xyz_, + mode='bilinear', + padding_mode='zeros', + align_corners=True) # B*TN,C,D,H,W + + v_embed_ = v_embed[torch.arange(B)[:, None], + target_indices.view(B, TN)].view(B * TN, -1) # B*TN + t_embed_ = t_embed.unsqueeze(1).repeat(1, TN, 1).view(B * TN, -1) + volume_feats_dict = self.frustum_volume_feats(volume_feats, t_embed_, + v_embed_) + return volume_feats_dict, volume_depth + + +""" + SyncDreamer is a SoTA Novel View Synthesis model which can generate 16 consistent views seamlessly. 
+ Please refer to: https://arxiv.org/abs/2309.03453 for more technique details. +""" + + +class SyncMultiviewDiffusion(pl.LightningModule): + + def __init__( + self, + unet_config, + scheduler_config, + finetune_unet=False, + finetune_projection=True, + view_num=16, + image_size=256, + cfg_scale=3.0, + output_num=8, + batch_view_num=4, + drop_conditions=False, + drop_scheme='default', + clip_image_encoder_path='/apdcephfs/private_rondyliu/projects/clip/ViT-L-14.pt' + ): + super().__init__() + + self.finetune_unet = finetune_unet + self.finetune_projection = finetune_projection + + self.view_num = view_num + self.viewpoint_dim = 4 + self.output_num = output_num + self.image_size = image_size + + self.batch_view_num = batch_view_num + self.cfg_scale = cfg_scale + + self.clip_image_encoder_path = clip_image_encoder_path + + self._init_time_step_embedding() + self._init_first_stage() + self._init_schedule() + self._init_multiview() + self._init_clip_image_encoder() + self._init_clip_projection() + + self.spatial_volume = SpatialVolumeNet(self.time_embed_dim, + self.viewpoint_dim, + self.view_num) + self.model = UNetWrapper( + unet_config, + drop_conditions=drop_conditions, + drop_scheme=drop_scheme) + self.scheduler_config = scheduler_config + + latent_size = image_size // 8 + self.ddim = SyncDDIMSampler( + self, 200, 'uniform', 1.0, latent_size=latent_size) + + def _init_clip_projection(self): + self.cc_projection = nn.Linear(772, 768) + nn.init.eye_(list(self.cc_projection.parameters())[0][:768, :768]) + nn.init.zeros_(list(self.cc_projection.parameters())[1]) + self.cc_projection.requires_grad_(True) + + if not self.finetune_projection: + disable_training_module(self.cc_projection) + + def _init_multiview(self): + K, azs, _, _, poses = read_pickle( + self.clip_image_encoder_path.replace( + 'ViT-L-14.pt', f'camera-{self.view_num}.pkl')) + default_image_size = 256 + ratio = self.image_size / default_image_size + K = np.diag([ratio, ratio, 1]) @ K + K = torch.from_numpy(K.astype(np.float32)) # [3,3] + K = K.unsqueeze(0).repeat(self.view_num, 1, 1) # N,3,3 + poses = torch.from_numpy(poses.astype(np.float32)) # N,3,4 + self.register_buffer('poses', poses) + self.register_buffer('Ks', K) + azs = (azs + np.pi) % ( + np.pi * 2) - np.pi # scale to [-pi,pi] and the index=0 has az=0 + self.register_buffer('azimuth', + torch.from_numpy(azs.astype(np.float32))) + + def get_viewpoint_embedding(self, batch_size, elevation_ref): + """ + @param batch_size: + @param elevation_ref: B + @return: + """ + azimuth_input = self.azimuth[0].unsqueeze(0) # 1 + azimuth_target = self.azimuth # N + elevation_input = -elevation_ref # note that zero123 use a negative elevation here!!! 
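+        # The lines below build the relative-pose embedding: for each of the
+        # N fixed target views, stack the elevation offset d_e, sin/cos of the
+        # azimuth offset d_a from the input view, and a zero placeholder into
+        # a B,N,4 tensor that is consumed downstream as v_embed.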
+ elevation_target = -np.deg2rad(30) + d_e = elevation_target - elevation_input # B + N = self.azimuth.shape[0] + B = batch_size + d_e = d_e.unsqueeze(1).repeat(1, N) + d_a = azimuth_target - azimuth_input # N + d_a = d_a.unsqueeze(0).repeat(B, 1) + d_z = torch.zeros_like(d_a) + embedding = torch.stack( + [d_e, torch.sin(d_a), torch.cos(d_a), d_z], -1) # B,N,4 + return embedding + + def _init_first_stage(self): + first_stage_config = { + 'target': + 'modelscope.models.cv.image_to_3d.ldm.models.autoencoder.AutoencoderKL', + 'params': { + 'embed_dim': 4, + 'monitor': 'val/rec_loss', + 'ddconfig': { + 'double_z': True, + 'z_channels': 4, + 'resolution': self.image_size, + 'in_channels': 3, + 'out_ch': 3, + 'ch': 128, + 'ch_mult': [1, 2, 4, 4], + 'num_res_blocks': 2, + 'attn_resolutions': [], + 'dropout': 0.0 + }, + 'lossconfig': { + 'target': 'torch.nn.Identity' + }, + } + } + self.first_stage_scale_factor = 0.18215 + self.first_stage_model = instantiate_from_config(first_stage_config) + self.first_stage_model = disable_training_module( + self.first_stage_model) + + def _init_clip_image_encoder(self): + self.clip_image_encoder = FrozenCLIPImageEmbedder( + model=self.clip_image_encoder_path) + self.clip_image_encoder = disable_training_module( + self.clip_image_encoder) + + def _init_schedule(self): + self.num_timesteps = 1000 + linear_start = 0.00085 + linear_end = 0.0120 + num_timesteps = 1000 + betas = torch.linspace( + linear_start**0.5, + linear_end**0.5, + num_timesteps, + dtype=torch.float32)**2 # T + assert betas.shape[0] == self.num_timesteps + + # all in float64 first + alphas = 1. - betas + alphas_cumprod = torch.cumprod(alphas, dim=0) # T + alphas_cumprod_prev = torch.cat( + [torch.ones(1, dtype=torch.float64), alphas_cumprod[:-1]], 0) + posterior_variance = betas * (1. - alphas_cumprod_prev) / ( + 1. - alphas_cumprod) # T + posterior_log_variance_clipped = torch.log( + torch.clamp(posterior_variance, min=1e-20)) + posterior_log_variance_clipped = torch.clamp( + posterior_log_variance_clipped, min=-10) + + self.register_buffer('betas', betas.float()) + self.register_buffer('alphas', alphas.float()) + self.register_buffer('alphas_cumprod', alphas_cumprod.float()) + self.register_buffer('sqrt_alphas_cumprod', + torch.sqrt(alphas_cumprod).float()) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + torch.sqrt(1 - alphas_cumprod).float()) + self.register_buffer('posterior_variance', posterior_variance.float()) + self.register_buffer('posterior_log_variance_clipped', + posterior_log_variance_clipped.float()) + + def _init_time_step_embedding(self): + self.time_embed_dim = 256 + self.time_embed = nn.Sequential( + nn.Linear(self.time_embed_dim, self.time_embed_dim), + nn.SiLU(True), + nn.Linear(self.time_embed_dim, self.time_embed_dim), + ) + + def encode_first_stage(self, x, sample=True): + with torch.no_grad(): + posterior = self.first_stage_model.encode(x) # b,4,h//8,w//8 + if sample: + return posterior.sample().detach( + ) * self.first_stage_scale_factor + else: + return posterior.mode().detach( + ) * self.first_stage_scale_factor + + def decode_first_stage(self, z): + with torch.no_grad(): + z = 1. 
/ self.first_stage_scale_factor * z + return self.first_stage_model.decode(z) + + def prepare(self, batch): + # encode target + if 'target_image' in batch: + image_target = batch['target_image'].permute(0, 1, 4, 2, + 3) # b,n,3,h,w + N = image_target.shape[1] + x = [ + self.encode_first_stage(image_target[:, ni], True) + for ni in range(N) + ] + x = torch.stack(x, 1) # b,n,4,h//8,w//8 + else: + x = None + + image_input = batch['input_image'].permute(0, 3, 1, 2) + elevation_input = batch['input_elevation'][:, 0] # b + x_input = self.encode_first_stage(image_input) + input_info = { + 'image': image_input, + 'elevation': elevation_input, + 'x': x_input + } + with torch.no_grad(): + clip_embed = self.clip_image_encoder.encode(image_input) + return x, clip_embed, input_info + + def embed_time(self, t): + t_embed = timestep_embedding( + t, self.time_embed_dim, repeat_only=False) # B,TED + t_embed = self.time_embed(t_embed) # B,TED + return t_embed + + def get_target_view_feats(self, x_input, spatial_volume, clip_embed, + t_embed, v_embed, target_index): + """ + @param x_input: B,4,H,W + @param spatial_volume: B,C,V,V,V + @param clip_embed: B,1,768 + @param t_embed: B,t_dim + @param v_embed: B,N,v_dim + @param target_index: B,TN + @return: + tensors of size B*TN,* + """ + B, _, H, W = x_input.shape + frustum_volume_feats, frustum_volume_depth = self.spatial_volume.construct_view_frustum_volume( + spatial_volume, t_embed, v_embed, self.poses, self.Ks, + target_index) + + # clip + TN = target_index.shape[1] + v_embed_ = v_embed[torch.arange(B)[:, None], + target_index].view(B * TN, + self.viewpoint_dim) # B*TN,v_dim + clip_embed_ = clip_embed.unsqueeze(1).repeat(1, TN, 1, + 1).view(B * TN, 1, 768) + clip_embed_ = self.cc_projection( + torch.cat([clip_embed_, v_embed_.unsqueeze(1)], -1)) # B*TN,1,768 + + x_input_ = x_input.unsqueeze(1).repeat(1, TN, 1, 1, + 1).view(B * TN, 4, H, W) + + x_concat = x_input_ + return clip_embed_, frustum_volume_feats, x_concat + + def training_step(self, batch): + B = batch['image'].shape[0] + time_steps = torch.randint( + 0, self.num_timesteps, (B, ), device=self.device).long() + + x, clip_embed, input_info = self.prepare(batch) + x_noisy, noise = self.add_noise(x, time_steps) # B,N,4,H,W + + N = self.view_num + target_index = torch.randint( + 0, N, (B, 1), device=self.device).long() # B, 1 + v_embed = self.get_viewpoint_embedding( + B, input_info['elevation']) # N,v_dim + + t_embed = self.embed_time(time_steps) + spatial_volume = self.spatial_volume.construct_spatial_volume( + x_noisy, t_embed, v_embed, self.poses, self.Ks) + + clip_embed, volume_feats, x_concat = self.get_target_view_feats( + input_info['x'], spatial_volume, clip_embed, t_embed, v_embed, + target_index) + + x_noisy_ = x_noisy[torch.arange(B)[:, None], + target_index][:, 0] # B,4,H,W + noise_predict = self.model( + x_noisy_, + time_steps, + clip_embed, + volume_feats, + x_concat, + is_train=True) # B,4,H,W + + noise_target = noise[torch.arange(B)[:, None], + target_index][:, 0] # B,4,H,W + # loss simple for diffusion + loss_simple = torch.nn.functional.mse_loss( + noise_target, noise_predict, reduction='none') + loss = loss_simple.mean() + self.log( + 'sim', + loss_simple.mean(), + prog_bar=True, + logger=True, + on_step=True, + on_epoch=True, + rank_zero_only=True) + + # log others + lr = self.optimizers().param_groups[0]['lr'] + self.log( + 'lr', + lr, + prog_bar=True, + logger=True, + on_step=True, + on_epoch=False, + rank_zero_only=True) + self.log( + 'step', + self.global_step, + prog_bar=True, 
+ logger=True, + on_step=True, + on_epoch=False, + rank_zero_only=True) + return loss + + def add_noise(self, x_start, t): + """ + @param x_start: B,* + @param t: B, + @return: + """ + B = x_start.shape[0] + noise = torch.randn_like(x_start) # B,* + + sqrt_alphas_cumprod_ = self.sqrt_alphas_cumprod[t] # B, + sqrt_one_minus_alphas_cumprod_ = self.sqrt_one_minus_alphas_cumprod[ + t] # B + sqrt_alphas_cumprod_ = sqrt_alphas_cumprod_.view( + B, *[1 for _ in range(len(x_start.shape) - 1)]) + sqrt_one_minus_alphas_cumprod_ = sqrt_one_minus_alphas_cumprod_.view( + B, *[1 for _ in range(len(x_start.shape) - 1)]) + x_noisy = sqrt_alphas_cumprod_ * x_start + sqrt_one_minus_alphas_cumprod_ * noise + return x_noisy, noise + + def sample(self, + batch, + cfg_scale, + batch_view_num, + use_ddim=True, + return_inter_results=False, + inter_interval=50, + inter_view_interval=2): + _, clip_embed, input_info = self.prepare(batch) + if use_ddim: + x_sample, inter = self.ddim.sample( + input_info, + clip_embed, + unconditional_scale=cfg_scale, + log_every_t=inter_interval, + batch_view_num=batch_view_num) + else: + raise NotImplementedError + + N = x_sample.shape[1] + x_sample = torch.stack( + [self.decode_first_stage(x_sample[:, ni]) for ni in range(N)], 1) + if return_inter_results: + torch.cuda.synchronize() + torch.cuda.empty_cache() + inter = torch.stack(inter['x_inter'], 2) # # B,N,T,C,H,W + B, N, T, C, H, W = inter.shape + inter_results = [] + for ni in tqdm(range(0, N, inter_view_interval)): + inter_results_ = [] + for ti in range(T): + inter_results_.append( + self.decode_first_stage(inter[:, ni, ti])) + inter_results.append(torch.stack(inter_results_, + 1)) # B,T,3,H,W + inter_results = torch.stack(inter_results, 1) # B,N,T,3,H,W + return x_sample, inter_results + else: + return x_sample + + def log_image(self, + x_sample, + batch, + step, + output_dir, + only_first_row=False): + + def process(x): + return ((torch.clip(x, min=-1, max=1).cpu().numpy() * 0.5 + 0.5) + * 255).astype(np.uint8) + + B = x_sample.shape[0] + N = x_sample.shape[1] + image_cond = [] + for bi in range(B): + img_pr_ = concat_images_list( + process(batch['ref_image'][bi]), *[ + process(x_sample[bi, ni].permute(1, 2, 0)) + for ni in range(N) + ]) + img_gt_ = concat_images_list( + process(batch['ref_image'][bi]), + *[process(batch['image'][bi, ni]) for ni in range(N)]) + if not only_first_row or bi == 0: + image_cond.append( + concat_images_list(img_gt_, img_pr_, vert=True)) + else: + image_cond.append(img_pr_) + + output_dir = Path(output_dir) + imsave( + str(output_dir / f'{step}.jpg'), + concat_images_list(*image_cond, vert=True)) + + @torch.no_grad() + def validation_step(self, batch, batch_idx): + if batch_idx == 0 and self.global_rank == 0: + self.eval() + step = self.global_step + batch_ = {} + for k, v in batch.items(): + batch_[k] = v[:self.output_num] + x_sample = self.sample(batch_, self.cfg_scale, self.batch_view_num) + output_dir = Path(self.image_dir) / 'images' / 'val' + output_dir.mkdir(exist_ok=True, parents=True) + self.log_image(x_sample, batch, step, output_dir=output_dir) + + def configure_optimizers(self): + lr = self.learning_rate + print(f'setting learning rate to {lr:.4f} ...') + paras = [] + if self.finetune_projection: + paras.append({ + 'params': self.cc_projection.parameters(), + 'lr': lr + }, ) + if self.finetune_unet: + paras.append({'params': self.model.parameters(), 'lr': lr}, ) + else: + paras.append( + { + 'params': self.model.get_trainable_parameters(), + 'lr': lr + }, ) + + paras.append({ + 
'params': self.time_embed.parameters(), + 'lr': lr * 10.0 + }, ) + paras.append( + { + 'params': self.spatial_volume.parameters(), + 'lr': lr * 10.0 + }, ) + + opt = torch.optim.AdamW(paras, lr=lr) + + scheduler = instantiate_from_config(self.scheduler_config) + print('Setting up LambdaLR scheduler...') + scheduler = [{ + 'scheduler': LambdaLR(opt, lr_lambda=scheduler.schedule), + 'interval': 'step', + 'frequency': 1 + }] + return [opt], scheduler + + +class SyncDDIMSampler: + + def __init__(self, + model: SyncMultiviewDiffusion, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + latent_size=32): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.latent_size = latent_size + self._make_schedule(ddim_num_steps, ddim_discretize, ddim_eta) + self.eta = ddim_eta + + def _make_schedule(self, + ddim_num_steps, + ddim_discretize='uniform', + ddim_eta=0., + verbose=True): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose) # DT + ddim_timesteps_ = torch.from_numpy( + self.ddim_timesteps.astype(np.int64)) # DT + + alphas_cumprod = self.model.alphas_cumprod # T + assert alphas_cumprod.shape[ + 0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + self.ddim_alphas = alphas_cumprod[ddim_timesteps_].double() # DT + self.ddim_alphas_prev = torch.cat( + [alphas_cumprod[0:1], alphas_cumprod[ddim_timesteps_[:-1]]], + 0) # DT + self.ddim_sigmas = ddim_eta * torch.sqrt( # noqa + (1 - self.ddim_alphas_prev) / (1 - self.ddim_alphas) * # noqa + (1 - self.ddim_alphas / self.ddim_alphas_prev)) # noqa + + self.ddim_alphas_raw = self.model.alphas[ddim_timesteps_].float() # DT + self.ddim_sigmas = self.ddim_sigmas.float() + self.ddim_alphas = self.ddim_alphas.float() + self.ddim_alphas_prev = self.ddim_alphas_prev.float() + self.ddim_sqrt_one_minus_alphas = torch.sqrt( + 1. - self.ddim_alphas).float() + + @torch.no_grad() + def denoise_apply_impl(self, + x_target_noisy, + index, + noise_pred, + is_step0=False): + """ + @param x_target_noisy: B,N,4,H,W + @param index: index + @param noise_pred: B,N,4,H,W + @param is_step0: bool + @return: + """ + device = x_target_noisy.device + B, N, _, H, W = x_target_noisy.shape + + # apply noise + a_t = self.ddim_alphas[index].to(device).float().view(1, 1, 1, 1, 1) + a_prev = self.ddim_alphas_prev[index].to(device).float().view( + 1, 1, 1, 1, 1) + sqrt_one_minus_at = self.ddim_sqrt_one_minus_alphas[index].to( + device).float().view(1, 1, 1, 1, 1) + sigma_t = self.ddim_sigmas[index].to(device).float().view( + 1, 1, 1, 1, 1) + + pred_x0 = (x_target_noisy + - sqrt_one_minus_at * noise_pred) / a_t.sqrt() + dir_xt = torch.clamp( + 1. 
- a_prev - sigma_t**2, min=1e-7).sqrt() * noise_pred + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + if not is_step0: + noise = sigma_t * torch.randn_like(x_target_noisy) + x_prev = x_prev + noise + return x_prev + + @torch.no_grad() + def denoise_apply(self, + x_target_noisy, + input_info, + clip_embed, + time_steps, + index, + unconditional_scale, + batch_view_num=1, + is_step0=False): + """ + @param x_target_noisy: B,N,4,H,W + @param input_info: + @param clip_embed: B,M,768 + @param time_steps: B, + @param index: int + @param unconditional_scale: + @param batch_view_num: int + @param is_step0: bool + @return: + """ + x_input, elevation_input = input_info['x'], input_info['elevation'] + B, N, C, H, W = x_target_noisy.shape + + # construct source data + v_embed = self.model.get_viewpoint_embedding( + B, elevation_input) # B,N,v_dim + t_embed = self.model.embed_time(time_steps) # B,t_dim + spatial_volume = self.model.spatial_volume.construct_spatial_volume( + x_target_noisy, t_embed, v_embed, self.model.poses, self.model.Ks) + + e_t = [] + target_indices = torch.arange(N) # N + for ni in range(0, N, batch_view_num): + x_target_noisy_ = x_target_noisy[:, ni:ni + batch_view_num] + VN = x_target_noisy_.shape[1] + x_target_noisy_ = x_target_noisy_.reshape(B * VN, C, H, W) + + time_steps_ = repeat_to_batch(time_steps, B, VN) + target_indices_ = target_indices[ni:ni + batch_view_num].unsqueeze( + 0).repeat(B, 1) + clip_embed_, volume_feats_, x_concat_ = self.model.get_target_view_feats( + x_input, spatial_volume, clip_embed, t_embed, v_embed, + target_indices_) + if unconditional_scale != 1.0: + noise = self.model.model.predict_with_unconditional_scale( + x_target_noisy_, time_steps_, clip_embed_, volume_feats_, + x_concat_, unconditional_scale) + else: + noise = self.model.model( + x_target_noisy_, + time_steps_, + clip_embed_, + volume_feats_, + x_concat_, + is_train=False) + e_t.append(noise.view(B, VN, 4, H, W)) + + e_t = torch.cat(e_t, 1) + x_prev = self.denoise_apply_impl(x_target_noisy, index, e_t, is_step0) + return x_prev + + @torch.no_grad() + def sample(self, + input_info, + clip_embed, + unconditional_scale=1.0, + log_every_t=50, + batch_view_num=1): + """ + @param input_info: x, elevation + @param clip_embed: B,M,768 + @param unconditional_scale: + @param log_every_t: + @param batch_view_num: + @return: + """ + print(f'unconditional scale {unconditional_scale:.1f}') + C, H, W = 4, self.latent_size, self.latent_size + B = clip_embed.shape[0] + N = self.model.view_num + device = self.model.device + x_target_noisy = torch.randn([B, N, C, H, W], device=device) + + timesteps = self.ddim_timesteps + intermediates = {'x_inter': []} + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + for i, step in enumerate(iterator): + index = total_steps - i - 1 # index in ddim state + time_steps = torch.full((B, ), + step, + device=device, + dtype=torch.long) + x_target_noisy = self.denoise_apply( + x_target_noisy, + input_info, + clip_embed, + time_steps, + index, + unconditional_scale, + batch_view_num=batch_view_num, + is_step0=index == 0) + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(x_target_noisy) + + return x_target_noisy, intermediates diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py new file mode 100644 index 000000000..2457746e1 --- 
/dev/null
+++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_attention.py
@@ -0,0 +1,195 @@
+import torch
+import torch.nn as nn
+
+from modelscope.models.cv.image_to_3d.ldm.modules.attention import (  # noqa
+    checkpoint, default, zero_module)
+from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.openaimodel import \
+    UNetModel
+from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import \
+    timestep_embedding
+
+
+class DepthAttention(nn.Module):
+
+    def __init__(self,
+                 query_dim,
+                 context_dim,
+                 heads,
+                 dim_head,
+                 output_bias=True):
+        super().__init__()
+        inner_dim = dim_head * heads
+        context_dim = default(context_dim, query_dim)
+
+        self.scale = dim_head**-0.5
+        self.heads = heads
+        self.dim_head = dim_head
+
+        self.to_q = nn.Conv2d(query_dim, inner_dim, 1, 1, bias=False)
+        self.to_k = nn.Conv3d(context_dim, inner_dim, 1, 1, bias=False)
+        self.to_v = nn.Conv3d(context_dim, inner_dim, 1, 1, bias=False)
+        if output_bias:
+            self.to_out = nn.Conv2d(inner_dim, query_dim, 1, 1)
+        else:
+            self.to_out = nn.Conv2d(inner_dim, query_dim, 1, 1, bias=False)
+
+    def forward(self, x, context):
+        """
+
+        @param x: b,f0,h,w
+        @param context: b,f1,d,h,w
+        @return:
+        """
+        hn, hd = self.heads, self.dim_head
+        b, _, h, w = x.shape
+        b, _, d, h, w = context.shape
+
+        q = self.to_q(x).reshape(b, hn, hd, h, w)  # b,t,h,w
+        k = self.to_k(context).reshape(b, hn, hd, d, h, w)  # b,t,d,h,w
+        v = self.to_v(context).reshape(b, hn, hd, d, h, w)  # b,t,d,h,w
+
+        sim = torch.sum(q.unsqueeze(3) * k, 2) * self.scale  # b,hn,d,h,w
+        attn = sim.softmax(dim=2)
+
+        # b,hn,hd,d,h,w * b,hn,1,d,h,w
+        out = torch.sum(v * attn.unsqueeze(2), 3)  # b,hn,hd,h,w
+        out = out.reshape(b, hn * hd, h, w)
+        return self.to_out(out)
+
+
+class DepthTransformer(nn.Module):
+
+    def __init__(self,
+                 dim,
+                 n_heads,
+                 d_head,
+                 context_dim=None,
+                 checkpoint=True):
+        super().__init__()
+        inner_dim = n_heads * d_head
+        self.proj_in = nn.Sequential(
+            nn.Conv2d(dim, inner_dim, 1, 1),
+            nn.GroupNorm(8, inner_dim),
+            nn.SiLU(True),
+        )
+        self.proj_context = nn.Sequential(
+            nn.Conv3d(context_dim, context_dim, 1, 1, bias=False),  # no bias
+            nn.GroupNorm(8, context_dim),
+            nn.ReLU(
+                True),  # ReLU only, so that a zero input yields a zero output
+        )
+        self.depth_attn = DepthAttention(
+            query_dim=inner_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            context_dim=context_dim,
+            output_bias=False
+        )  # is a self-attention if not self.disable_self_attn
+        self.proj_out = nn.Sequential(
+            nn.GroupNorm(8, inner_dim),
+            nn.ReLU(True),
+            nn.Conv2d(inner_dim, inner_dim, 3, 1, 1, bias=False),
+            nn.GroupNorm(8, inner_dim),
+            nn.ReLU(True),
+            zero_module(
+                nn.Conv2d(inner_dim, dim, 3, 1, 1, bias=False)),
+        )
+        self.checkpoint = checkpoint
+
+    def forward(self, x, context=None):
+        return checkpoint(self._forward, (x, context), self.parameters(),
+                          self.checkpoint)
+
+    def _forward(self, x, context):
+        x_in = x
+        x = self.proj_in(x)
+        context = self.proj_context(context)
+        x = self.depth_attn(x, context)
+        x = self.proj_out(x) + x_in
+        return x
+
+
+class DepthWiseAttention(UNetModel):
+
+    def __init__(self, volume_dims=(5, 16, 32, 64), *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # num_heads = 4
+        model_channels = kwargs['model_channels']
+        channel_mult = kwargs['channel_mult']
+        d0, d1, d2, d3 = volume_dims
+
+        # 4
+        ch = model_channels * channel_mult[2]
+        self.middle_conditions = DepthTransformer(
+            ch, 4, d3 // 2, context_dim=d3)
+
+        self.output_conditions =
nn.ModuleList() + self.output_b2c = { + 3: 0, + 4: 1, + 5: 2, + 6: 3, + 7: 4, + 8: 5, + 9: 6, + 10: 7, + 11: 8 + } + # 8 + ch = model_channels * channel_mult[2] + self.output_conditions.append( + DepthTransformer(ch, 4, d2 // 2, context_dim=d2)) # 0 + self.output_conditions.append( + DepthTransformer(ch, 4, d2 // 2, context_dim=d2)) # 1 + # 16 + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 2 + ch = model_channels * channel_mult[1] + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 3 + self.output_conditions.append( + DepthTransformer(ch, 4, d1 // 2, context_dim=d1)) # 4 + # 32 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 5 + ch = model_channels * channel_mult[0] + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 6 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 7 + self.output_conditions.append( + DepthTransformer(ch, 4, d0 // 2, context_dim=d0)) # 8 + + def forward(self, + x, + timesteps=None, + context=None, + source_dict=None, + **kwargs): + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + h = x.type(self.dtype) + for index, module in enumerate(self.input_blocks): + h = module(h, emb, context) + hs.append(h) + + h = self.middle_block(h, emb, context) + h = self.middle_conditions(h, context=source_dict[h.shape[-1]]) + + for index, module in enumerate(self.output_blocks): + h = torch.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + if index in self.output_b2c: + layer = self.output_conditions[self.output_b2c[index]] + h = layer(h, context=source_dict[h.shape[-1]]) + + h = h.type(x.dtype) + return self.out(h) + + def get_trainable_parameters(self): + paras = [para for para in self.middle_conditions.parameters() + ] + [para for para in self.output_conditions.parameters()] + return paras diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py new file mode 100644 index 000000000..9b3d6616d --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_network.py @@ -0,0 +1,233 @@ +import torch +import torch.nn as nn + + +class Image2DResBlockWithTV(nn.Module): + + def __init__(self, dim, tdim, vdim): + super().__init__() + + def norm(c): + return nn.GroupNorm(8, c) + + self.time_embed = nn.Conv2d(tdim, dim, 1, 1) + self.view_embed = nn.Conv2d(vdim, dim, 1, 1) + self.conv = nn.Sequential( + norm(dim), + nn.SiLU(True), + nn.Conv2d(dim, dim, 3, 1, 1), + norm(dim), + nn.SiLU(True), + nn.Conv2d(dim, dim, 3, 1, 1), + ) + + def forward(self, x, t, v): + return x + self.conv(x + self.time_embed(t) + self.view_embed(v)) + + +class NoisyTargetViewEncoder(nn.Module): + + def __init__(self, + time_embed_dim, + viewpoint_dim, + run_dim=16, + output_dim=8): + super().__init__() + + self.init_conv = nn.Conv2d(4, run_dim, 3, 1, 1) + self.out_conv0 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.out_conv1 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.out_conv2 = Image2DResBlockWithTV(run_dim, time_embed_dim, + viewpoint_dim) + self.final_out = nn.Sequential( + nn.GroupNorm(8, run_dim), nn.SiLU(True), + nn.Conv2d(run_dim, output_dim, 3, 1, 1)) + + def forward(self, x, t, v): + B, DT = t.shape + t = t.view(B, DT, 1, 1) + B, DV = v.shape + v = 
v.view(B, DV, 1, 1) + + x = self.init_conv(x) + x = self.out_conv0(x, t, v) + x = self.out_conv1(x, t, v) + x = self.out_conv2(x, t, v) + x = self.final_out(x) + return x + + +class SpatialUpTimeBlock(nn.Module): + + def __init__(self, x_in_dim, t_in_dim, out_dim): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_in_dim, x_in_dim, 1, 1) # 16 + self.norm = norm_act(x_in_dim) + self.silu = nn.SiLU(True) + self.conv = nn.ConvTranspose3d( + x_in_dim, + out_dim, + kernel_size=3, + padding=1, + output_padding=1, + stride=2) + + def forward(self, x, t): + x = x + self.t_conv(t) + return self.conv(self.silu(self.norm(x))) + + +class SpatialTimeBlock(nn.Module): + + def __init__(self, x_in_dim, t_in_dim, out_dim, stride): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_in_dim, x_in_dim, 1, 1) # 16 + self.bn = norm_act(x_in_dim) + self.silu = nn.SiLU(True) + self.conv = nn.Conv3d(x_in_dim, out_dim, 3, stride=stride, padding=1) + + def forward(self, x, t): + x = x + self.t_conv(t) + return self.conv(self.silu(self.bn(x))) + + +class SpatialTime3DNet(nn.Module): + + def __init__(self, time_dim=256, input_dim=128, dims=(32, 64, 128, 256)): + super().__init__() + d0, d1, d2, d3 = dims + dt = time_dim + + self.init_conv = nn.Conv3d(input_dim, d0, 3, 1, 1) # 32 + self.conv0 = SpatialTimeBlock(d0, dt, d0, stride=1) + + self.conv1 = SpatialTimeBlock(d0, dt, d1, stride=2) + self.conv2_0 = SpatialTimeBlock(d1, dt, d1, stride=1) + self.conv2_1 = SpatialTimeBlock(d1, dt, d1, stride=1) + + self.conv3 = SpatialTimeBlock(d1, dt, d2, stride=2) + self.conv4_0 = SpatialTimeBlock(d2, dt, d2, stride=1) + self.conv4_1 = SpatialTimeBlock(d2, dt, d2, stride=1) + + self.conv5 = SpatialTimeBlock(d2, dt, d3, stride=2) + self.conv6_0 = SpatialTimeBlock(d3, dt, d3, stride=1) + self.conv6_1 = SpatialTimeBlock(d3, dt, d3, stride=1) + + self.conv7 = SpatialUpTimeBlock(d3, dt, d2) + self.conv8 = SpatialUpTimeBlock(d2, dt, d1) + self.conv9 = SpatialUpTimeBlock(d1, dt, d0) + + def forward(self, x, t): + B, C = t.shape + t = t.view(B, C, 1, 1, 1) + + x = self.init_conv(x) + conv0 = self.conv0(x, t) + + x = self.conv1(conv0, t) + x = self.conv2_0(x, t) + conv2 = self.conv2_1(x, t) + + x = self.conv3(conv2, t) + x = self.conv4_0(x, t) + conv4 = self.conv4_1(x, t) + + x = self.conv5(conv4, t) + x = self.conv6_0(x, t) + x = self.conv6_1(x, t) + + x = conv4 + self.conv7(x, t) + x = conv2 + self.conv8(x, t) + x = conv0 + self.conv9(x, t) + return x + + +class FrustumTVBlock(nn.Module): + + def __init__(self, x_dim, t_dim, v_dim, out_dim, stride): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_dim, x_dim, 1, 1) # 16 + self.v_conv = nn.Conv3d(v_dim, x_dim, 1, 1) # 16 + self.bn = norm_act(x_dim) + self.silu = nn.SiLU(True) + self.conv = nn.Conv3d(x_dim, out_dim, 3, stride=stride, padding=1) + + def forward(self, x, t, v): + x = x + self.t_conv(t) + self.v_conv(v) + return self.conv(self.silu(self.bn(x))) + + +class FrustumTVUpBlock(nn.Module): + + def __init__(self, x_dim, t_dim, v_dim, out_dim): + super().__init__() + + def norm_act(c): + return nn.GroupNorm(8, c) + + self.t_conv = nn.Conv3d(t_dim, x_dim, 1, 1) # 16 + self.v_conv = nn.Conv3d(v_dim, x_dim, 1, 1) # 16 + self.norm = norm_act(x_dim) + self.silu = nn.SiLU(True) + self.conv = nn.ConvTranspose3d( + x_dim, + out_dim, + kernel_size=3, + padding=1, + output_padding=1, + stride=2) + + def forward(self, x, t, v): + x = x + 
self.t_conv(t) + self.v_conv(v) + return self.conv(self.silu(self.norm(x))) + + +class FrustumTV3DNet(nn.Module): + + def __init__(self, in_dim, t_dim, v_dim, dims=(32, 64, 128, 256)): + super().__init__() + self.conv0 = nn.Conv3d(in_dim, dims[0], 3, 1, 1) # 32 + + self.conv1 = FrustumTVBlock(dims[0], t_dim, v_dim, dims[1], 2) + self.conv2 = FrustumTVBlock(dims[1], t_dim, v_dim, dims[1], 1) + + self.conv3 = FrustumTVBlock(dims[1], t_dim, v_dim, dims[2], 2) + self.conv4 = FrustumTVBlock(dims[2], t_dim, v_dim, dims[2], 1) + + self.conv5 = FrustumTVBlock(dims[2], t_dim, v_dim, dims[3], 2) + self.conv6 = FrustumTVBlock(dims[3], t_dim, v_dim, dims[3], 1) + + self.up0 = FrustumTVUpBlock(dims[3], t_dim, v_dim, dims[2]) + self.up1 = FrustumTVUpBlock(dims[2], t_dim, v_dim, dims[1]) + self.up2 = FrustumTVUpBlock(dims[1], t_dim, v_dim, dims[0]) + + def forward(self, x, t, v): + B, DT = t.shape + t = t.view(B, DT, 1, 1, 1) + B, DV = v.shape + v = v.view(B, DV, 1, 1, 1) + + b, _, d, h, w = x.shape + x0 = self.conv0(x) + x1 = self.conv2(self.conv1(x0, t, v), t, v) + x2 = self.conv4(self.conv3(x1, t, v), t, v) + x3 = self.conv6(self.conv5(x2, t, v), t, v) + + x2 = self.up0(x3, t, v) + x2 + x1 = self.up1(x2, t, v) + x1 + x0 = self.up2(x1, t, v) + x0 + return {w: x0, w // 2: x1, w // 4: x2, w // 8: x3} diff --git a/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py new file mode 100644 index 000000000..e7f2921ff --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/models/diffusion/sync_dreamer_utils.py @@ -0,0 +1,130 @@ +import torch +from kornia import create_meshgrid + + +def project_and_normalize(ref_grid, src_proj, length): + """ + + @param ref_grid: b 3 n + @param src_proj: b 4 4 + @param length: int + @return: b, n, 2 + """ + src_grid = src_proj[:, :3, :3] @ ref_grid + src_proj[:, :3, 3:] # b 3 n + div_val = src_grid[:, -1:] + div_val[div_val < 1e-4] = 1e-4 + src_grid = src_grid[:, :2] / div_val # divide by depth (b, 2, n) + src_grid[:, 0] = src_grid[:, 0] / ((length - 1) / 2) - 1 # scale to -1~1 + src_grid[:, 1] = src_grid[:, 1] / ((length - 1) / 2) - 1 # scale to -1~1 + src_grid = src_grid.permute(0, 2, 1) # (b, n, 2) + return src_grid + + +def construct_project_matrix(x_ratio, y_ratio, Ks, poses): + """ + @param x_ratio: float + @param y_ratio: float + @param Ks: b,3,3 + @param poses: b,3,4 + @return: + """ + rfn = Ks.shape[0] + scale_m = torch.tensor([x_ratio, y_ratio, 1.0], + dtype=torch.float32, + device=Ks.device) + scale_m = torch.diag(scale_m) + ref_prj = scale_m[None, :, :] @ Ks @ poses # rfn,3,4 + pad_vals = torch.zeros([rfn, 1, 4], + dtype=torch.float32, + device=ref_prj.device) + pad_vals[:, :, 3] = 1.0 + ref_prj = torch.cat([ref_prj, pad_vals], 1) # rfn,4,4 + return ref_prj + + +def get_warp_coordinates(volume_xyz, warp_size, input_size, Ks, warp_pose): + B, _, D, H, W = volume_xyz.shape + ratio = warp_size / input_size + warp_proj = construct_project_matrix(ratio, ratio, Ks, warp_pose) # B,4,4 + warp_coords = project_and_normalize( + volume_xyz.view(B, 3, D * H * W), warp_proj, + warp_size).view(B, D, H, W, 2) + return warp_coords + + +def create_target_volume(depth_size, + volume_size, + input_image_size, + pose_target, + K, + near=None, + far=None): + device, dtype = pose_target.device, pose_target.dtype + + # compute a depth range on the unit sphere + H, W, D, B = volume_size, volume_size, depth_size, pose_target.shape[0] + if near is not None and far is not None: + # near, far 
b,1,h,w + depth_values = torch.linspace( + 0, 1, steps=depth_size).to(near.device).to(near.dtype) # d + depth_values = depth_values.view(1, D, 1, 1) # 1,d,1,1 + depth_values = depth_values * (far - near) + near # b d h w + depth_values = depth_values.view(B, 1, D, H * W) + else: + near, far = near_far_from_unit_sphere_using_camera_poses( + pose_target) # b 1 + depth_values = torch.linspace( + 0, 1, steps=depth_size).to(near.device).to(near.dtype) # d + depth_values = depth_values[None, :, None] * ( + far[:, None, :] - near[:, None, :]) + near[:, None, :] # b d 1 + depth_values = depth_values.view(B, 1, D, 1).expand(B, 1, D, H * W) + + ratio = volume_size / input_image_size + + # creat a grid on the target (reference) view + # H, W, D, B = volume_size, volume_size, depth_values.shape[1], depth_values.shape[0] + + # creat mesh grid: note reference also means target + ref_grid = create_meshgrid( + H, W, normalized_coordinates=False) # (1, H, W, 2) + ref_grid = ref_grid.to(device).to(dtype) + ref_grid = ref_grid.permute(0, 3, 1, 2) # (1, 2, H, W) + ref_grid = ref_grid.reshape(1, 2, H * W) # (1, 2, H*W) + ref_grid = ref_grid.expand(B, -1, -1) # (B, 2, H*W) + ref_grid = torch.cat( + (ref_grid, + torch.ones(B, 1, H * W, dtype=ref_grid.dtype, + device=ref_grid.device)), + dim=1) # (B, 3, H*W) + ref_grid = ref_grid.unsqueeze(2) * depth_values # (B, 3, D, H*W) + + # unproject to space and transfer to world coordinates. + Ks = K + ref_proj = construct_project_matrix(ratio, ratio, Ks, pose_target) # B,4,4 + ref_proj_inv = torch.inverse(ref_proj) # B,4,4 + ref_grid = ref_proj_inv[:, :3, :3] @ ref_grid.view( + B, 3, D * H + * W) + ref_proj_inv[:, :3, 3:] # B,3,3 @ B,3,DHW + B,3,1 => B,3,DHW + return ref_grid.reshape(B, 3, D, H, W), depth_values.view(B, 1, D, H, W) + + +def near_far_from_unit_sphere_using_camera_poses(camera_poses): + """ + @param camera_poses: b 3 4 + @return: + near: b,1 + far: b,1 + """ + R_w2c = camera_poses[..., :3, :3] # b 3 3 + t_w2c = camera_poses[..., :3, 3:] # b 3 1 + camera_origin = -R_w2c.permute(0, 2, 1) @ t_w2c # b 3 1 + # R_w2c.T @ (0,0,1) = z_dir + camera_orient = R_w2c.permute(0, 2, 1)[..., :3, 2:3] # b 3 1 + camera_origin, camera_orient = camera_origin[..., + 0], camera_orient[..., + 0] # b 3 + a = torch.sum(camera_orient**2, dim=-1, keepdim=True) # b 1 + b = -torch.sum(camera_orient * camera_origin, dim=-1, keepdim=True) # b 1 + mid = b / a # b 1 + near, far = mid - 1.0, mid + 1.0 + return near, far diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/attention.py b/modelscope/models/cv/image_to_3d/ldm/modules/attention.py new file mode 100644 index 000000000..aeab0a064 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/attention.py @@ -0,0 +1,382 @@ +import math +from inspect import isfunction + +import torch +import torch.nn.functional as F +from einops import rearrange, repeat +from torch import einsum, nn + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import \ + checkpoint + + +def exists(val): + return val is not None + + +def uniq(arr): + return {el: True for el in arr}.keys() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + + +# feedforward +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def 
forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +# feedforward +class ConvGEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Conv2d(dim_in, dim_out * 2, 1, 1, 0) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True) + + +class LinearAttention(nn.Module): + + def __init__(self, dim, heads=4, dim_head=32): + super().__init__() + self.heads = heads + hidden_dim = dim_head * heads + self.to_qkv = nn.Conv2d(dim, hidden_dim * 3, 1, bias=False) + self.to_out = nn.Conv2d(hidden_dim, dim, 1) + + def forward(self, x): + b, c, h, w = x.shape + qkv = self.to_qkv(x) + q, k, v = rearrange( + qkv, + 'b (qkv heads c) h w -> qkv b heads c (h w)', + heads=self.heads, + qkv=3) + k = k.softmax(dim=-1) + context = torch.einsum('bhdn,bhen->bhde', k, v) + out = torch.einsum('bhde,bhdn->bhen', context, q) + out = rearrange( + out, + 'b heads c (h w) -> b (heads c) h w', + heads=self.heads, + h=h, + w=w) + return self.to_out(out) + + +class SpatialSelfAttention(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = rearrange(q, 'b c h w -> b (h w) c') + k = rearrange(k, 'b c h w -> b c (h w)') + w_ = torch.einsum('bij,bjk->bik', q, k) + + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, 'b c h w -> b c (h w)') + w_ = rearrange(w_, 'b i j -> b j i') + h_ = torch.einsum('bij,bjk->bik', v, w_) + h_ = rearrange(h_, 'b c (h w) -> b c h w', h=h) + h_ = self.proj_out(h_) + + return x + h_ + + +class CrossAttention(nn.Module): + + def __init__(self, + query_dim, + context_dim=None, + heads=8, + dim_head=64, + dropout=0.): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout)) + + def forward(self, x, context=None, mask=None): + h = 
self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), + (q, k, v)) + + sim = einsum('b i d, b j d -> b i j', q, k) * self.scale + + if exists(mask): + mask = mask > 0 + mask = rearrange(mask, 'b ... -> b (...)') + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, 'b j -> (b h) () j', h=h) + sim.masked_fill_(~mask, max_neg_value) + + # attention, what we cannot get enough of + attn = sim.softmax(dim=-1) + + out = einsum('b i j, b j d -> b i d', attn, v) + out = rearrange(out, '(b h) n d -> b n (h d)', h=h) + return self.to_out(out) + + +class BasicSpatialTransformer(nn.Module): + + def __init__(self, + dim, + n_heads, + d_head, + context_dim=None, + checkpoint=True): + super().__init__() + inner_dim = n_heads * d_head + self.proj_in = nn.Sequential( + nn.GroupNorm(8, dim), + nn.Conv2d(dim, inner_dim, kernel_size=1, stride=1, padding=0), + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + ) + self.attn = CrossAttention( + query_dim=inner_dim, + heads=n_heads, + dim_head=d_head, + context_dim=context_dim + ) # is a self-attention if not self.disable_self_attn + self.out_conv = nn.Sequential( + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + nn.Conv2d(inner_dim, inner_dim, 1, 1), + ) + self.proj_out = nn.Sequential( + nn.GroupNorm(8, inner_dim), + nn.ReLU(True), + zero_module( + nn.Conv2d(inner_dim, dim, kernel_size=1, stride=1, padding=0)), + ) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context): + # input + b, _, h, w = x.shape + x_in = x + x = self.proj_in(x) + + # attention + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + context = rearrange(context, 'b c h w -> b (h w) c').contiguous() + x = self.attn(x, context) + x + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + + # output + x = self.out_conv(x) + x + x = self.proj_out(x) + x_in + return x + + +class BasicTransformerBlock(nn.Module): + + def __init__(self, + dim, + n_heads, + d_head, + dropout=0., + context_dim=None, + gated_ff=True, + checkpoint=True, + disable_self_attn=False): + super().__init__() + self.disable_self_attn = disable_self_attn + self.attn1 = CrossAttention( + query_dim=dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + context_dim=context_dim if self.disable_self_attn else + None) # is a self-attention if not self.disable_self_attn + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = CrossAttention( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint(self._forward, (x, context), self.parameters(), + self.checkpoint) + + def _forward(self, x, context=None): + x = self.attn1( + self.norm1(x), + context=context if self.disable_self_attn else None) + x + x = self.attn2(self.norm2(x), context=context) + x + x = self.ff(self.norm3(x)) + x + return x + + +class ConvFeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential( + nn.Conv2d(dim, inner_dim, 1, 1, 0), + 
nn.GELU()) if not glu else ConvGEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Conv2d(inner_dim, dim_out, 1, 1, 0)) + + def forward(self, x): + return self.net(x) + + +class SpatialTransformer(nn.Module): + """ + Transformer block for image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. + Finally, reshape to image + """ + + def __init__(self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0., + context_dim=None, + disable_self_attn=False): + super().__init__() + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0) + + self.transformer_blocks = nn.ModuleList([ + BasicTransformerBlock( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim, + disable_self_attn=disable_self_attn) for d in range(depth) + ]) + + self.proj_out = zero_module( + nn.Conv2d( + inner_dim, in_channels, kernel_size=1, stride=1, padding=0)) + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + x = self.proj_in(x) + x = rearrange(x, 'b c h w -> b (h w) c').contiguous() + for block in self.transformer_blocks: + x = block(x, context=context) + x = rearrange(x, 'b (h w) c -> b c h w', h=h, w=w).contiguous() + x = self.proj_out(x) + return x + x_in diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py new file mode 100644 index 000000000..83780c98e --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/model.py @@ -0,0 +1,964 @@ +# pytorch_diffusion + derived encoder decoder +import math + +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange + +from modelscope.models.cv.image_to_3d.ldm.modules.attention import \ + LinearAttention +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def get_timestep_embedding(timesteps, embedding_dim): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: + From Fairseq. + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". 
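+    Concretely, each timestep t is mapped to
+    [sin(t * f_0), ..., sin(t * f_{h-1}), cos(t * f_0), ..., cos(t * f_{h-1})]
+    with f_i = 10000 ** (-i / (h - 1)) and h = embedding_dim // 2; the result
+    is zero-padded when embedding_dim is odd.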
+ """ + assert len(timesteps.shape) == 1 + + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) + emb = emb.to(device=timesteps.device) + emb = timesteps.float()[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm( + num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True) + + +class Upsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = torch.nn.functional.interpolate( + x, scale_factor=2.0, mode='nearest') + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode='constant', value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + + def __init__(self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=1) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, + out_channels, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class LinAttnBlock(LinearAttention): + """to match AttnBlock usage""" + + def __init__(self, in_channels): + super().__init__(dim=in_channels, heads=1, dim_head=in_channels) + + +class AttnBlock(nn.Module): + + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.k = 
torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, c, h * w) + q = q.permute(0, 2, 1) # b,hw,c + k = k.reshape(b, c, h * w) # b,c,hw + w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c)**(-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b, c, h * w) + w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm( + v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b, c, h, w) + + h_ = self.proj_out(h_) + + return x + h_ + + +def make_attn(in_channels, attn_type='vanilla'): + assert attn_type in ['vanilla', 'linear', + 'none'], f'attn_type {attn_type} unknown' + print( + f"making attention of type '{attn_type}' with {in_channels} in_channels" + ) + if attn_type == 'vanilla': + return AttnBlock(in_channels) + elif attn_type == 'none': + return nn.Identity(in_channels) + else: + return LinAttnBlock(in_channels) + + +class Model(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + use_timestep=True, + use_linear_attn=False, + attn_type='vanilla'): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = self.ch * 4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList([ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ]) + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + skip_in = ch * 
ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + if i_block == self.num_res_blocks: + skip_in = ch * in_ch_mult[i_level] + block.append( + ResnetBlock( + in_channels=block_in + skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1) + + def forward(self, x, t=None, context=None): + # assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](torch.cat([h, hs.pop()], + dim=1), temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + def get_last_layer(self): + return self.conv_out.weight + + +class Encoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + double_z=True, + use_linear_attn=False, + attn_type='vanilla', + **ignore_kwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1) + + curr_res = resolution + in_ch_mult = (1, ) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = 
Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + + def __init__(self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type='vanilla', + **ignorekwargs): + super().__init__() + if use_linear_attn: + attn_type = 'linear' + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + # in_ch_mult = (1, ) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2**(self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print('Working with z of shape {} = {} dimensions.'.format( + self.z_shape, np.prod(self.z_shape))) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, 
kernel_size=3, stride=1, padding=1) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class SimpleDecoder(nn.Module): + + def __init__(self, in_channels, out_channels, *args, **kwargs): + super().__init__() + self.model = nn.ModuleList([ + nn.Conv2d(in_channels, in_channels, 1), + ResnetBlock( + in_channels=in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=2 * in_channels, + out_channels=4 * in_channels, + temb_channels=0, + dropout=0.0), + ResnetBlock( + in_channels=4 * in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0), + nn.Conv2d(2 * in_channels, in_channels, 1), + Upsample(in_channels, with_conv=True) + ]) + # end + self.norm_out = Normalize(in_channels) + self.conv_out = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + for i, layer in enumerate(self.model): + if i in [1, 2, 3]: + x = layer(x, None) + else: + x = layer(x) + + h = self.norm_out(x) + h = nonlinearity(h) + x = self.conv_out(h) + return x + + +class UpsampleDecoder(nn.Module): + + def __init__(self, + in_channels, + out_channels, + ch, + num_res_blocks, + resolution, + ch_mult=(2, 2), + dropout=0.0): + super().__init__() + # upsampling + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = in_channels + curr_res = resolution // 2**(self.num_resolutions - 1) + self.res_blocks = nn.ModuleList() + self.upsample_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + res_block = [] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + res_block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout)) + block_in = block_out + self.res_blocks.append(nn.ModuleList(res_block)) + if i_level != self.num_resolutions - 1: + self.upsample_blocks.append(Upsample(block_in, True)) + curr_res = curr_res * 2 + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_channels, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + # upsampling + h = x + for k, i_level in enumerate(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.res_blocks[i_level][i_block](h, None) + if i_level != self.num_resolutions - 1: + h = self.upsample_blocks[k](h) + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class LatentRescaler(nn.Module): + + def __init__(self, + factor, + in_channels, + mid_channels, + out_channels, + depth=2): + super().__init__() + # residual block, interpolate, residual block + self.factor = factor + self.conv_in = nn.Conv2d( + in_channels, mid_channels, kernel_size=3, stride=1, padding=1) + self.res_block1 = 
nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + self.attn = AttnBlock(mid_channels) + self.res_block2 = nn.ModuleList([ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0) for _ in range(depth) + ]) + + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) + + def forward(self, x): + x = self.conv_in(x) + for block in self.res_block1: + x = block(x, None) + x = torch.nn.functional.interpolate( + x, + size=(int(round(x.shape[2] * self.factor)), + int(round(x.shape[3] * self.factor)))) + x = self.attn(x) + for block in self.res_block2: + x = block(x, None) + x = self.conv_out(x) + return x + + +class MergedRescaleEncoder(nn.Module): + + def __init__(self, + in_channels, + ch, + resolution, + out_ch, + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + ch_mult=(1, 2, 4, 8), + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + intermediate_chn = ch * ch_mult[-1] + self.encoder = Encoder( + in_channels=in_channels, + num_res_blocks=num_res_blocks, + ch=ch, + ch_mult=ch_mult, + z_channels=intermediate_chn, + double_z=False, + resolution=resolution, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + out_ch=None) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=intermediate_chn, + mid_channels=intermediate_chn, + out_channels=out_ch, + depth=rescale_module_depth) + + def forward(self, x): + x = self.encoder(x) + x = self.rescaler(x) + return x + + +class MergedRescaleDecoder(nn.Module): + + def __init__(self, + z_channels, + out_ch, + resolution, + num_res_blocks, + attn_resolutions, + ch, + ch_mult=(1, 2, 4, 8), + dropout=0.0, + resamp_with_conv=True, + rescale_factor=1.0, + rescale_module_depth=1): + super().__init__() + tmp_chn = z_channels * ch_mult[-1] + self.decoder = Decoder( + out_ch=out_ch, + z_channels=tmp_chn, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + in_channels=None, + num_res_blocks=num_res_blocks, + ch_mult=ch_mult, + resolution=resolution, + ch=ch) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=z_channels, + mid_channels=tmp_chn, + out_channels=tmp_chn, + depth=rescale_module_depth) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Upsampler(nn.Module): + + def __init__(self, + in_size, + out_size, + in_channels, + out_channels, + ch_mult=2): + super().__init__() + assert out_size >= in_size + num_blocks = int(np.log2(out_size // in_size)) + 1 + factor_up = 1. 
+ (out_size % in_size) + print( + f'Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}' + ) + self.rescaler = LatentRescaler( + factor=factor_up, + in_channels=in_channels, + mid_channels=2 * in_channels, + out_channels=in_channels) + self.decoder = Decoder( + out_ch=out_channels, + resolution=out_size, + z_channels=in_channels, + num_res_blocks=2, + attn_resolutions=[], + in_channels=None, + ch=in_channels, + ch_mult=[ch_mult for _ in range(num_blocks)]) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Resize(nn.Module): + + def __init__(self, in_channels=None, learned=False, mode='bilinear'): + super().__init__() + self.with_conv = learned + self.mode = mode + if self.with_conv: + print( + f'Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode' + ) + raise NotImplementedError() + assert in_channels is not None + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=4, stride=2, padding=1) + + def forward(self, x, scale_factor=1.0): + if scale_factor == 1.0: + return x + else: + x = torch.nn.functional.interpolate( + x, + mode=self.mode, + align_corners=False, + scale_factor=scale_factor) + return x + + +class FirstStagePostProcessor(nn.Module): + + def __init__(self, + ch_mult: list, + in_channels, + pretrained_model: nn.Module = None, + reshape=False, + n_channels=None, + dropout=0., + pretrained_config=None): + super().__init__() + if pretrained_config is None: + assert pretrained_model is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' + self.pretrained_model = pretrained_model + else: + assert pretrained_config is not None, 'Either "pretrained_model" or "pretrained_config" must not be None' + self.instantiate_pretrained(pretrained_config) + + self.do_reshape = reshape + + if n_channels is None: + n_channels = self.pretrained_model.encoder.ch + + self.proj_norm = Normalize(in_channels, num_groups=in_channels // 2) + self.proj = nn.Conv2d( + in_channels, n_channels, kernel_size=3, stride=1, padding=1) + + blocks = [] + downs = [] + ch_in = n_channels + for m in ch_mult: + blocks.append( + ResnetBlock( + in_channels=ch_in, + out_channels=m * n_channels, + dropout=dropout)) + ch_in = m * n_channels + downs.append(Downsample(ch_in, with_conv=False)) + + self.model = nn.ModuleList(blocks) + self.downsampler = nn.ModuleList(downs) + + def instantiate_pretrained(self, config): + model = instantiate_from_config(config) + self.pretrained_model = model.eval() + # self.pretrained_model.train = False + for param in self.pretrained_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def encode_with_pretrained(self, x): + c = self.pretrained_model.encode(x) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + return c + + def forward(self, x): + z_fs = self.encode_with_pretrained(x) + z = self.proj_norm(z_fs) + z = self.proj(z) + z = nonlinearity(z) + + for submodel, downmodel in zip(self.model, self.downsampler): + z = submodel(z, temb=None) + z = downmodel(z) + + if self.do_reshape: + z = rearrange(z, 'b c h w -> b (h w) c') + return z diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py new file mode 100644 index 000000000..5b6ac5fc8 --- /dev/null +++ 
b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/openaimodel.py @@ -0,0 +1,1018 @@ +import math +from abc import abstractmethod +from functools import partial +from typing import Iterable + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from modelscope.models.cv.image_to_3d.ldm.modules.attention import \ + SpatialTransformer +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + avg_pool_nd, checkpoint, conv_nd, linear, normalization, + timestep_embedding, zero_module) +from modelscope.models.cv.image_to_3d.ldm.util import exists + + +# dummy replace +def convert_module_to_f16(x): + pass + + +def convert_module_to_f32(x): + pass + + +# go +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter( + th.randn(embed_dim, spacial_dim**2 + 1) / embed_dim**0.5) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + x = layer(x, context) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. 
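As noted above, the 3-D branch keeps the leading depth dimension and doubles only the two innermost spatial dimensions, while the 2-D branch doubles every spatial dimension. A self-contained shape check using plain F.interpolate (illustrative only, not part of this module):

    import torch
    import torch.nn.functional as F

    x2d = torch.randn(1, 8, 16, 16)
    x3d = torch.randn(1, 8, 4, 16, 16)
    # dims == 2: every spatial dimension is doubled
    assert F.interpolate(x2d, scale_factor=2, mode='nearest').shape == (1, 8, 32, 32)
    # dims == 3: depth (x.shape[2]) is preserved, only the inner two dims are doubled
    assert F.interpolate(x3d, (x3d.shape[2], x3d.shape[3] * 2, x3d.shape[4] * 2),
                         mode='nearest').shape == (1, 8, 4, 32, 32)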
+ """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd( + dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), + mode='nearest') + else: + x = F.interpolate(x, scale_factor=2, mode='nearest') + if self.use_conv: + x = self.conv(x) + return x + + +class TransposedUpsample(nn.Module): + 'Learned 2x upsampling without padding' + + def __init__(self, channels, out_channels=None, ks=5): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + + self.up = nn.ConvTranspose2d( + self.channels, self.out_channels, kernel_size=ks, stride=2) + + def forward(self, x): + return self.up(x) + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, + channels, + use_conv, + dims=2, + out_channels=None, + padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, + self.channels, + self.out_channels, + 3, + stride=stride, + padding=padding) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. 
+ """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels + if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd( + dims, self.out_channels, self.out_channels, 3, padding=1)), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, + 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint(self._forward, (x, emb), self.parameters(), + self.use_checkpoint) + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: # False + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f'q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}' + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint( + self._forward, (x, ), self.parameters(), True + ) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! + # return pt_checkpoint(self._forward, x) # pytorch + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial**2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split( + ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', q * scale, + k * scale) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + 'bct,bcs->bts', + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum('bts,bcs->bct', weight, + v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. + """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None, 'Fool!! You forgot to include the dimension of your \ + cross-attention conditioning...' + + if context_dim is not None: + assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your \ + cross-attention conditioning...' 
+ + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError( + 'provide num_res_blocks either as an int (globally constant) or ' + 'as a list/tuple (per-level) with the same length as channel_mult' + ) + self.num_res_blocks = num_res_blocks + # self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all( + map( + lambda i: self.num_res_blocks[i] >= num_attention_blocks[i + ], + range(len(num_attention_blocks)))) + print( + f'Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. ' + f'This option has LESS priority than attention_resolutions {attention_resolutions}, ' + f'i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, ' + f'attention will still not be set.' + ) # todo: convert to warning + + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) # 0 + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: # always True + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + 
use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa)) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( # always uses a self-attn + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(self.num_res_blocks[level] + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks + ) or i < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else + SpatialTransformer( + ch, + num_heads, + dim_head, + depth=transformer_depth, + context_dim=context_dim, + disable_self_attn=disabled_sa)) + if level and i == self.num_res_blocks[level]: + out_ch = ch + layers.append( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) if resblock_updown else Upsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out 
= nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module( + conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + # nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), 'must specify y if and only if the model is class-conditional' + hs = [] + t_emb = timestep_embedding( + timesteps, self.model_channels, repeat_only=False) # N + emb = self.time_embed(t_emb) # + + if self.num_classes is not None: + assert y.shape == (x.shape[0], ) + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) # conv + hs.append(h) + h = self.middle_block(h, emb, context) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + h = h.type(x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) + + +class EncoderUNetModel(nn.Module): + """ + The half UNet model with attention and timestep embedding. + For usage, see UNet. 
+ """ + + def __init__(self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + pool='adaptive', + *args, + **kwargs): + super().__init__() + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + self.num_res_blocks = num_res_blocks + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList([ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1)) + ]) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for _ in range(num_res_blocks): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + )) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) if resblock_updown else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch)) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=num_head_channels, + use_new_attention_order=use_new_attention_order, + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + self.pool = pool + if pool == 'adaptive': + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + nn.AdaptiveAvgPool2d((1, 1)), + zero_module(conv_nd(dims, ch, out_channels, 1)), + nn.Flatten(), + ) + elif pool == 'attention': + assert num_head_channels != -1 + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + AttentionPool2d((image_size // ds), ch, num_head_channels, + out_channels), + ) + elif pool == 'spatial': + 
self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + nn.ReLU(), + nn.Linear(2048, self.out_channels), + ) + elif pool == 'spatial_v2': + self.out = nn.Sequential( + nn.Linear(self._feature_size, 2048), + normalization(2048), + nn.SiLU(), + nn.Linear(2048, self.out_channels), + ) + else: + raise NotImplementedError(f'Unexpected {pool} pooling') + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + + def forward(self, x, timesteps): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :return: an [N x K] Tensor of outputs. + """ + emb = self.time_embed( + timestep_embedding(timesteps, self.model_channels)) + + results = [] + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb) + if self.pool.startswith('spatial'): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = self.middle_block(h, emb) + if self.pool.startswith('spatial'): + results.append(h.type(x.dtype).mean(dim=(2, 3))) + h = th.cat(results, axis=-1) + return self.out(h) + else: + h = h.type(x.dtype) + return self.out(h) diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py new file mode 100644 index 000000000..a63d05a3c --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/diffusionmodules/util.py @@ -0,0 +1,307 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! 
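The helpers below provide the noise-schedule builders, the gradient-checkpointing wrapper, and the timestep-embedding utilities used by the modules above. As a quick orientation, a typical call into the schedule builder (argument values are simply the function defaults, shown for illustration) looks like:

    # returns a float64 numpy array of shape (1000,), rising from 1e-4 to 2e-2
    betas = make_beta_schedule('linear', n_timestep=1000,
                               linear_start=1e-4, linear_end=2e-2)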
+ +import math +import os + +import numpy as np +import torch +import torch.nn as nn +from einops import repeat + +from modelscope.models.cv.image_to_3d.ldm.util import instantiate_from_config + + +def make_beta_schedule(schedule, + n_timestep, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + if schedule == 'linear': + betas = ( + torch.linspace( + linear_start**0.5, + linear_end**0.5, + n_timestep, + dtype=torch.float64)**2) + + elif schedule == 'cosine': + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + + cosine_s) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == 'sqrt_linear': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64) + elif schedule == 'sqrt': + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64)**0.5 + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_timesteps(ddim_discr_method, + num_ddim_timesteps, + num_ddpm_timesteps, + verbose=True): + if ddim_discr_method == 'uniform': + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == 'quad': + ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), + num_ddim_timesteps))**2).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f'Selected timesteps for ddim sampler: {steps_out}') + return steps_out + + +def make_ddim_sampling_parameters(alphacums, + ddim_timesteps, + eta, + verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * # noqa + (1 - alphas / alphas_prev)) # noqa + if verbose: + print( + f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}' + ) + print( + f'For the chosen value of eta, which is {eta}, ' + f'this results in the following sigma_t schedule for ddim sampler {sigmas}' + ) + return sigmas, alphas, alphas_prev + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. 
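A common way to drive this helper is to hand it the cosine alpha-bar curve, which yields a squared-cosine beta schedule; shown here only as an illustration of the calling convention:

    import math

    def cosine_alpha_bar(t, s=0.008):
        # cumulative product of (1 - beta) as a function of t in [0, 1]
        return math.cos((t + s) / (1 + s) * math.pi / 2) ** 2

    # betas = betas_for_alpha_bar(1000, cosine_alpha_bar)   # numpy array, every entry <= 0.999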
+ """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1, ) * (len(x_shape) - 1))) + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [ + x.detach().requires_grad_(True) for x in ctx.input_tensors + ] + with torch.enable_grad(): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. + shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat( + [embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + else: + embedding = repeat(timesteps, 'b -> b d', d=dim) + return embedding + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. 
+ """ + return GroupNorm32(32, channels) + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + + def forward(self, x): + return x * torch.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + + def forward(self, x): + return super().forward(x.float()).type(x.dtype) + + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f'unsupported dimensions: {dims}') + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config( + c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} + + +def noise_like(shape, device, repeat=False): + + def repeat_noise(): + return torch.randn((1, *shape[1:]), + device=device).repeat(shape[0], + *((1, ) * (len(shape) - 1))) + + def noise(): + return torch.randn(shape, device=device) + + return repeat_noise() if repeat else noise() diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/distributions/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py new file mode 100644 index 000000000..24cbbbc89 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/distributions/distributions.py @@ -0,0 +1,95 @@ +import numpy as np +import torch + + +class AbstractDistribution: + + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like( + self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn( + self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum( + torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + 
torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1, 2, 3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + (source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/ + guided_diffusion/losses.py#L12) + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, 'at least one argument must be a Tensor' + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + # noqa + ((mean1 - mean2)**2) * torch.exp(-logvar2)) # noqa diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py new file mode 100644 index 000000000..dcc561953 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py new file mode 100644 index 000000000..413452498 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/clip.py @@ -0,0 +1,251 @@ +import hashlib +import os +import urllib +import warnings +from typing import Any, List, Union + +import packaging +import torch +from PIL import Image +from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, + ToTensor) +from tqdm import tqdm + +from modelscope.models.cv.image_to_3d.ldm.modules.encoders.clip.model import \ + build_model + +try: + from torchvision.transforms import InterpolationMode + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + +if packaging.version.parse( + torch.__version__) < packaging.version.parse('1.7.1'): + warnings.warn('PyTorch version 1.7.1 or higher is recommended') + +__all__ = ['available_models', 'load'] + +_MODELS = { + 'RN50': + 'https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/\ + RN50.pt', + 'RN101': + 'https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/\ + RN101.pt', + 'RN50x4': + 'https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/\ + RN50x4.pt', + 'RN50x16': + 'https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/\ + RN50x16.pt', + 'RN50x64': + 
'https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/\ + RN50x64.pt', + 'ViT-B/32': + 'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/\ + ViT-B-32.pt', + 'ViT-B/16': + 'https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/\ + ViT-B-16.pt', + 'ViT-L/14': + 'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/\ + ViT-L-14.pt', + 'ViT-L/14@336px': + 'https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/\ + ViT-L-14-336px.pt', +} + + +def _download(url: str, root: str): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split('/')[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError( + f'{download_target} exists and is not a regular file') + + if os.path.isfile(download_target): + if hashlib.sha256(open(download_target, + 'rb').read()).hexdigest() == expected_sha256: + return download_target + else: + warnings.warn( + f'{download_target} exists, but the SHA256 checksum does not match; re-downloading the file' + ) + + with urllib.request.urlopen(url) as source, open(download_target, + 'wb') as output: + with tqdm( + total=int(source.info().get('Content-Length')), + ncols=80, + unit='iB', + unit_scale=True, + unit_divisor=1024) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if hashlib.sha256(open(download_target, + 'rb').read()).hexdigest() != expected_sha256: + raise RuntimeError( + 'Model has been downloaded but the SHA256 checksum does not not match' + ) + + return download_target + + +def _convert_image_to_rgb(image): + return image.convert('RGB') + + +def _transform(n_px): + return Compose([ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + _convert_image_to_rgb, + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit: bool = False, + download_root: str = None): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model or more hackable non-JIT model (default). 
+ + download_root: str + path to download the model files; by default, it uses "~/.cache/clip" + + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download( + _MODELS[name], download_root + or os.path.expanduser('~/.cache/clip')) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError( + f'Model {name} not found; available models = {available_models()}') + + with open(model_path, 'rb') as opened_file: + try: + # loading JIT archive + model = torch.jit.load( + opened_file, map_location=device if jit else 'cpu').eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn( + f'File {model_path} is not a JIT archive. Loading as a state dict instead' + ) + jit = False + state_dict = torch.load(opened_file, map_location='cpu') + + if not jit: + model = build_model(state_dict or model.state_dict()).to(device) + if str(device) == 'cpu': + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def _node_get(node: torch._C.Node, key: str): + """Gets attributes of a node which is polymorphic over return type. + + From https://github.com/pytorch/pytorch/pull/82628 + """ + sel = node.kindOf(key) + return getattr(node, sel)(key) + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + _node_get(node, 'value')).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, 'graph') else [] + except RuntimeError: + graphs = [] + + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [ + 1, 2 + ]: # dtype can be the second or third argument to aten::to() + if _node_get(inputs[i].node(), 'value') == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py new file mode 100644 index 000000000..c3d0471f5 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/model.py @@ -0,0 +1,511 @@ +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import 
torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.relu1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu2 = nn.ReLU(inplace=True) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu3 = nn.ReLU(inplace=True) + + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu1(self.bn1(self.conv1(x))) + out = self.relu2(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu3(out) + return out + + +class AttentionPool2d(nn.Module): + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor): + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + 
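+    # Note: callers feed this stack in LND order, i.e. x has shape
+    # [seq_len, batch, width]; VisionTransformer and CLIP.encode_text both
+    # permute NLD -> LND before calling it and permute back afterwards.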
+ def forward(self, x: torch.Tensor): + return self.resblocks(x) + + +class VisionTransformer(nn.Module): + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + def forward(self, x: torch.Tensor): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + torch_zeros = torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device) + x = torch.cat([self.class_embedding.to(x.dtype) + torch_zeros, x], + dim=1) # shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + +class CLIP(nn.Module): + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features**-0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + 
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, self.visual.layer2, self.visual.layer3, + self.visual.layer4 + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith('bn3.weight'): + nn.init.zeros_(param) + + proj_std = (self.transformer.width**-0.5) * ( + (2 * self.transformer.layers)**-0.5) + attn_std = self.transformer.width**-0.5 + fc_std = (2 * self.transformer.width)**-0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_( + self.text_projection, std=self.transformer.width**-0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, image): + return self.visual(image.type(self.dtype)) + + def encode_text(self, text): + x = self.token_embedding(text).type( + self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm( + dim=1, keepdim=True) + text_features = text_features / text_features.norm(dim=1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logits_per_image.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(layer): + if isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Linear)): + layer.weight.data = layer.weight.data.half() + if layer.bias is not None: + layer.bias.data = layer.bias.data.half() + + if isinstance(layer, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(layer, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ['text_projection', 'proj']: + if hasattr(layer, name): + attr = getattr(layer, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict): + vit = 'visual.proj' in state_dict + + if vit: + vision_width = state_dict['visual.conv1.weight'].shape[0] + vision_layers = len([ + k for k in state_dict.keys() + if k.startswith('visual.') and k.endswith('.attn.in_proj_weight') + ]) + vision_patch_size = 
state_dict['visual.conv1.weight'].shape[-1] + grid_size = round( + (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [ + len( + set( + k.split('.')[2] for k in state_dict + if k.startswith(f'visual.layer{b}'))) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0] + output_width = round( + (state_dict['visual.attnpool.positional_embedding'].shape[0] + - 1)**0.5) + vision_patch_size = None + assert output_width**2 + 1 == state_dict[ + 'visual.attnpool.positional_embedding'].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict['text_projection'].shape[1] + context_length = state_dict['positional_embedding'].shape[0] + vocab_size = state_dict['token_embedding.weight'].shape[0] + transformer_width = state_dict['ln_final.weight'].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split('.')[2] for k in state_dict + if k.startswith('transformer.resblocks'))) + + model = CLIP(embed_dim, image_resolution, vision_layers, vision_width, + vision_patch_size, context_length, vocab_size, + transformer_width, transformer_heads, transformer_layers) + + for key in ['input_resolution', 'context_length', 'vocab_size']: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict) + return model.eval() diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py new file mode 100644 index 000000000..ffd0d0928 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/clip/simple_tokenizer.py @@ -0,0 +1,149 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
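+    For example, get_pairs(('h', 'e', 'l', 'l', 'o')) returns
+    {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.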
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except BaseException: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py new file mode 100644 index 000000000..d8fbc03d9 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/encoders/modules.py @@ -0,0 +1,704 @@ +import random +from functools import partial + +import kornia +import kornia.augmentation as K +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision import transforms +from transformers import (CLIPTextModel, CLIPTokenizer, CLIPVisionModel, + T5EncoderModel, T5Tokenizer) + +from modelscope.models.cv.image_to_3d.ldm.modules.diffusionmodules.util import ( + extract_into_tensor, make_beta_schedule, noise_like) +# import clip +from 
modelscope.models.cv.image_to_3d.ldm.modules.encoders import clip +# TODO: can we directly rely on lucidrains code and simply add this as a reuirement? --> test +from modelscope.models.cv.image_to_3d.ldm.modules.x_transformer import ( + Encoder, TransformerWrapper) +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.id_loss import IDFeatures +from modelscope.models.cv.image_to_3d.ldm.util import (default, + instantiate_from_config) + + +class AbstractEncoder(nn.Module): + + def __init__(self): + super().__init__() + + def encode(self, *args, **kwargs): + raise NotImplementedError + + +class IdentityEncoder(AbstractEncoder): + + def encode(self, x): + return x + + +class FaceClipEncoder(AbstractEncoder): + + def __init__(self, augment=True, retreival_key=None): + super().__init__() + self.encoder = FrozenCLIPImageEmbedder() + self.augment = augment + self.retreival_key = retreival_key + + def forward(self, img): + encodings = [] + with torch.no_grad(): + x_offset = 125 + if self.retreival_key: + # Assumes retrieved image are packed into the second half of channels + face = img[:, 3:, 190:440, x_offset:(512 - x_offset)] + other = img[:, :3, ...].clone() + else: + face = img[:, :, 190:440, x_offset:(512 - x_offset)] + other = img.clone() + + if self.augment: + face = K.RandomHorizontalFlip()(face) + + other[:, :, 190:440, x_offset:(512 - x_offset)] *= 0 + encodings = [ + self.encoder.encode(face), + self.encoder.encode(other), + ] + + return torch.cat(encodings, dim=1) + + def encode(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros( + (1, 2, 768), + device=self.encoder.model.visual.conv1.weight.device) + + return self(img) + + +class FaceIdClipEncoder(AbstractEncoder): + + def __init__(self): + super().__init__() + self.encoder = FrozenCLIPImageEmbedder() + for p in self.encoder.parameters(): + p.requires_grad = False + self.id = FrozenFaceEncoder( + '/home/jpinkney/code/stable-diffusion/model_ir_se50.pth', + augment=True) + + def forward(self, img): + encodings = [] + with torch.no_grad(): + face = kornia.geometry.resize( + img, (256, 256), interpolation='bilinear', align_corners=True) + + other = img.clone() + other[:, :, 184:452, 122:396] *= 0 + encodings = [ + self.id.encode(face), + self.encoder.encode(other), + ] + + return torch.cat(encodings, dim=1) + + def encode(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros( + (1, 2, 768), + device=self.encoder.model.visual.conv1.weight.device) + + return self(img) + + +class ClassEmbedder(nn.Module): + + def __init__(self, embed_dim, n_classes=1000, key='class'): + super().__init__() + self.key = key + self.embedding = nn.Embedding(n_classes, embed_dim) + + def forward(self, batch, key=None): + if key is None: + key = self.key + # this is for use in crossattn + c = batch[key][:, None] + c = self.embedding(c) + return c + + +class TransformerEmbedder(AbstractEncoder): + """Some transformer encoder layers""" + + def __init__(self, + n_embed, + n_layer, + vocab_size, + max_seq_len=77, + device='cuda'): + super().__init__() + self.device = device + self.transformer = TransformerWrapper( + num_tokens=vocab_size, + max_seq_len=max_seq_len, + attn_layers=Encoder(dim=n_embed, depth=n_layer)) + + def forward(self, tokens): + tokens = tokens.to(self.device) # meh + z = self.transformer(tokens, return_embeddings=True) + return z + + def encode(self, x): + return self(x) + + +class BERTTokenizer(AbstractEncoder): + """ Uses a pretrained BERT tokenizer by huggingface. 
Vocab size: 30522 (?)""" + + def __init__(self, device='cuda', vq_interface=True, max_length=77): + super().__init__() + from transformers import BertTokenizerFast # TODO: add to reuquirements + self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased') + self.device = device + self.vq_interface = vq_interface + self.max_length = max_length + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + return tokens + + @torch.no_grad() + def encode(self, text): + tokens = self(text) + if not self.vq_interface: + return tokens + return None, None, [None, None, tokens] + + def decode(self, text): + return text + + +class BERTEmbedder(AbstractEncoder): + """Uses the BERT tokenizr model and add some transformer encoder layers""" + + def __init__(self, + n_embed, + n_layer, + vocab_size=30522, + max_seq_len=77, + device='cuda', + use_tokenizer=True, + embedding_dropout=0.0): + super().__init__() + self.use_tknz_fn = use_tokenizer + if self.use_tknz_fn: + self.tknz_fn = BERTTokenizer( + vq_interface=False, max_length=max_seq_len) + self.device = device + self.transformer = TransformerWrapper( + num_tokens=vocab_size, + max_seq_len=max_seq_len, + attn_layers=Encoder(dim=n_embed, depth=n_layer), + emb_dropout=embedding_dropout) + + def forward(self, text): + if self.use_tknz_fn: + tokens = self.tknz_fn(text) # .to(self.device) + else: + tokens = text + z = self.transformer(tokens, return_embeddings=True) + return z + + def encode(self, text): + # output of length 77 + return self(text) + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class FrozenT5Embedder(AbstractEncoder): + """Uses the T5 transformer encoder for text""" + + def __init__(self, + version='google/t5-v1_1-large', + device='cuda', + max_length=77 + ): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.transformer = T5EncoderModel.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.device = device + self.max_length = max_length # TODO: typical value? 
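+        # self.freeze() below switches the transformer to eval mode and
+        # disables all gradients, so this module only produces fixed features.
+        # Rough usage sketch (output width assumes the default
+        # 'google/t5-v1_1-large' checkpoint, d_model = 1024):
+        #   z = FrozenT5Embedder(device='cpu').encode(['a photo of a cat'])
+        #   # z.shape -> torch.Size([1, 77, 1024])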
+ self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class FrozenFaceEncoder(AbstractEncoder): + + def __init__(self, model_path, augment=False): + super().__init__() + self.loss_fn = IDFeatures(model_path) + # face encoder is frozen + for p in self.loss_fn.parameters(): + p.requires_grad = False + # Mapper is trainable + self.mapper = torch.nn.Linear(512, 768) + p = 0.25 + if augment: + self.augment = K.AugmentationSequential( + K.RandomHorizontalFlip(p=0.5), + K.RandomEqualize(p=p), + # K.RandomPlanckianJitter(p=p), + # K.RandomPlasmaBrightness(p=p), + # K.RandomPlasmaContrast(p=p), + # K.ColorJiggle(0.02, 0.2, 0.2, p=p), + ) + else: + self.augment = False + + def forward(self, img): + if isinstance(img, list): + # Uncondition + return torch.zeros((1, 1, 768), device=self.mapper.weight.device) + + if self.augment is not None: + # Transforms require 0-1 + img = self.augment((img + 1) / 2) + img = 2 * img - 1 + + feat = self.loss_fn(img, crop=True) + feat = self.mapper(feat.unsqueeze(1)) + return feat + + def encode(self, img): + return self(img) + + +class FrozenCLIPEmbedder(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.tokenizer = CLIPTokenizer.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.transformer = CLIPTextModel.from_pretrained( + version, + cache_dir='/apdcephfs/private_rondyliu/projects/huggingface_models' + ) + self.device = device + self.max_length = max_length # TODO: typical value? + self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding='max_length', + return_tensors='pt') + tokens = batch_encoding['input_ids'].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class ClipImageProjector(AbstractEncoder): + """ + Uses the CLIP image encoder. + """ + + def __init__(self, + version='openai/clip-vit-large-patch14', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.model = CLIPVisionModel.from_pretrained(version) + self.model.train() + self.max_length = max_length # TODO: typical value? 
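+        # CLIPVisionModel for ViT-L/14 emits 1024-d hidden states per patch
+        # token; the linear mapper below projects them to the 768-d text
+        # embedding width, and forward() zero-pads the token axis up to
+        # max_length so the result lines up with FrozenCLIPEmbedder outputs.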
+ self.antialias = True + self.mapper = torch.nn.Linear(1024, 768) + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + null_cond = self.get_null_cond(version, max_length) + self.register_buffer('null_cond', null_cond) + + @torch.no_grad() + def get_null_cond(self, version, max_length): + device = self.mean.device + embedder = FrozenCLIPEmbedder( + version=version, device=device, max_length=max_length) + null_cond = embedder(['']) + return null_cond + + def preprocess(self, x): + # Expects inputs in the range -1, 1 + x = kornia.geometry.resize( + x, (224, 224), + interpolation='bicubic', + align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + if isinstance(x, list): + return self.null_cond + # x is assumed to be in range [-1,1] + x = self.preprocess(x) + outputs = self.model(pixel_values=x) + last_hidden_state = outputs.last_hidden_state + last_hidden_state = self.mapper(last_hidden_state) + return F.pad( + last_hidden_state, + [0, 0, 0, self.max_length - last_hidden_state.shape[1], 0, 0]) + + def encode(self, im): + return self(im) + + +class ProjectedFrozenCLIPEmbedder(AbstractEncoder): + + def __init__(self, + version='openai/clip-vit-large-patch14', + device='cuda', + max_length=77): # clip-vit-base-patch32 + super().__init__() + self.embedder = FrozenCLIPEmbedder( + version=version, device=device, max_length=max_length) + self.projection = torch.nn.Linear(768, 768) + + def forward(self, text): + z = self.embedder(text) + return self.projection(z) + + def encode(self, text): + return self(text) + + +class FrozenCLIPImageEmbedder(AbstractEncoder): + """ + Uses the CLIP image encoder. + Not actually frozen... If you want that set cond_stage_trainable=False in cfg + """ + + def __init__( + self, + model='ViT-L/14', + jit=False, + device='cpu', + antialias=False, + ): + super().__init__() + self.model, _ = clip.load(name=model, device=device, jit=jit) + # We don't use the text part so delete it + del self.model.transformer + self.antialias = antialias + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + + def preprocess(self, x): + # Expects inputs in the range -1, 1 + x = kornia.geometry.resize( + x, (224, 224), + interpolation='bicubic', + align_corners=True, + antialias=self.antialias) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + # x is assumed to be in range [-1,1] + if isinstance(x, list): + # [""] denotes condition dropout for ucg + device = self.model.visual.conv1.weight.device + return torch.zeros(1, 768, device=device) + return self.model.encode_image(self.preprocess(x)).float() + + def encode(self, im): + return self(im).unsqueeze(1) + + +class FrozenCLIPImageMutliEmbedder(AbstractEncoder): + """ + Uses the CLIP image encoder. + Not actually frozen... 
If you want that set cond_stage_trainable=False in cfg + """ + + def __init__( + self, + model='ViT-L/14', + jit=False, + device='cpu', + antialias=True, + max_crops=5, + ): + super().__init__() + self.model, _ = clip.load(name=model, device=device, jit=jit) + # We don't use the text part so delete it + del self.model.transformer + self.antialias = antialias + self.register_buffer( + 'mean', + torch.Tensor([0.48145466, 0.4578275, 0.40821073]), + persistent=False) + self.register_buffer( + 'std', + torch.Tensor([0.26862954, 0.26130258, 0.27577711]), + persistent=False) + self.max_crops = max_crops + + def preprocess(self, x): + + # Expects inputs in the range -1, 1 + randcrop = transforms.RandomResizedCrop( + 224, scale=(0.085, 1.0), ratio=(1, 1)) + max_crops = self.max_crops + patches = [] + crops = [randcrop(x) for _ in range(max_crops)] + patches.extend(crops) + x = torch.cat(patches, dim=0) + x = (x + 1.) / 2. + # renormalize according to clip + x = kornia.enhance.normalize(x, self.mean, self.std) + return x + + def forward(self, x): + # x is assumed to be in range [-1,1] + if isinstance(x, list): + # [""] denotes condition dropout for ucg + device = self.model.visual.conv1.weight.device + return torch.zeros(1, self.max_crops, 768, device=device) + batch_tokens = [] + for im in x: + patches = self.preprocess(im.unsqueeze(0)) + tokens = self.model.encode_image(patches).float() + for t in tokens: + if random.random() < 0.1: + t *= 0 + batch_tokens.append(tokens.unsqueeze(0)) + + return torch.cat(batch_tokens, dim=0) + + def encode(self, im): + return self(im) + + +class SpatialRescaler(nn.Module): + + def __init__(self, + n_stages=1, + method='bilinear', + multiplier=0.5, + in_channels=3, + out_channels=None, + bias=False): + super().__init__() + self.n_stages = n_stages + assert self.n_stages >= 0 + assert method in [ + 'nearest', 'linear', 'bilinear', 'trilinear', 'bicubic', 'area' + ] + self.multiplier = multiplier + self.interpolator = partial( + torch.nn.functional.interpolate, mode=method) + self.remap_output = out_channels is not None + if self.remap_output: + print( + f'Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing.' + ) + self.channel_mapper = nn.Conv2d( + in_channels, out_channels, 1, bias=bias) + + def forward(self, x): + for stage in range(self.n_stages): + x = self.interpolator(x, scale_factor=self.multiplier) + + if self.remap_output: + x = self.channel_mapper(x) + return x + + def encode(self, x): + return self(x) + + +class LowScaleEncoder(nn.Module): + + def __init__(self, + model_config, + linear_start, + linear_end, + timesteps=1000, + max_noise_level=250, + output_size=64, + scale_factor=1.0): + super().__init__() + self.max_noise_level = max_noise_level + self.model = instantiate_from_config(model_config) + self.augmentation_schedule = self.register_schedule( + timesteps=timesteps, + linear_start=linear_start, + linear_end=linear_end) + self.out_size = output_size + self.scale_factor = scale_factor + + def register_schedule(self, + beta_schedule='linear', + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3): + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. 
- betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[ + 0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', + to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', + to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', + to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', + to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', + to_torch(np.sqrt(1. / alphas_cumprod - 1))) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) + * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, + x_start.shape) * noise) + + def forward(self, x): + z = self.model.encode(x).sample() + z = z * self.scale_factor + noise_level = torch.randint( + 0, self.max_noise_level, (x.shape[0], ), device=x.device).long() + z = self.q_sample(z, noise_level) + if self.out_size is not None: + z = torch.nn.functional.interpolate( + z, size=self.out_size, + mode='nearest') # TODO: experiment with mode + # z = z.repeat_interleave(2, -2).repeat_interleave(2, -1) + return z, noise_level + + def decode(self, z): + z = z / self.scale_factor + return self.model.decode(z) + + +if __name__ == '__main__': + from ldm.util import count_params + sentences = [ + 'a hedgehog drinking a whiskey', 'der mond ist aufgegangen', + "Ein Satz mit vielen Sonderzeichen: äöü ß ?! 
: 'xx-y/@s'" + ] + model = FrozenT5Embedder(version='google/t5-v1_1-xl').cuda() + count_params(model, True) + z = model(sentences) + print(z.shape) + + model = FrozenCLIPEmbedder().cuda() + count_params(model, True) + z = model(sentences) + print(z.shape) + + print('done.') diff --git a/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py b/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py new file mode 100644 index 000000000..0e5d7b8f7 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/modules/x_transformer.py @@ -0,0 +1,682 @@ +"""shout-out to https://github.com/lucidrains/x-transformers/tree/main/x_transformers""" +from collections import namedtuple +from functools import partial +from inspect import isfunction + +import torch +import torch.nn.functional as F +from einops import rearrange, reduce, repeat +from torch import einsum, nn + +# constants + +DEFAULT_DIM_HEAD = 64 + +Intermediates = namedtuple('Intermediates', + ['pre_softmax_attn', 'post_softmax_attn']) + +LayerIntermediates = namedtuple('Intermediates', + ['hiddens', 'attn_intermediates']) + + +class AbsolutePositionalEmbedding(nn.Module): + + def __init__(self, dim, max_seq_len): + super().__init__() + self.emb = nn.Embedding(max_seq_len, dim) + self.init_() + + def init_(self): + nn.init.normal_(self.emb.weight, std=0.02) + + def forward(self, x): + n = torch.arange(x.shape[1], device=x.device) + return self.emb(n)[None, :, :] + + +class FixedPositionalEmbedding(nn.Module): + + def __init__(self, dim): + super().__init__() + inv_freq = 1. / (10000**(torch.arange(0, dim, 2).float() / dim)) + self.register_buffer('inv_freq', inv_freq) + + def forward(self, x, seq_dim=1, offset=0): + t = torch.arange( + x.shape[seq_dim], device=x.device).type_as(self.inv_freq) + offset + sinusoid_inp = torch.einsum('i , j -> i j', t, self.inv_freq) + emb = torch.cat((sinusoid_inp.sin(), sinusoid_inp.cos()), dim=-1) + return emb[None, :, :] + + +# helpers + + +def exists(val): + return val is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def always(val): + + def inner(*args, **kwargs): + return val + + return inner + + +def not_equals(val): + + def inner(x): + return x != val + + return inner + + +def equals(val): + + def inner(x): + return x == val + + return inner + + +def max_neg_value(tensor): + return -torch.finfo(tensor.dtype).max + + +# keyword argument helpers + + +def pick_and_pop(keys, d): + values = list(map(lambda key: d.pop(key), keys)) + return dict(zip(keys, values)) + + +def group_dict_by_key(cond, d): + return_val = [dict(), dict()] + for key in d.keys(): + match = bool(cond(key)) + ind = int(not match) + return_val[ind][key] = d[key] + return (*return_val, ) + + +def string_begins_with(prefix, str): + return str.startswith(prefix) + + +def group_by_key_prefix(prefix, d): + return group_dict_by_key(partial(string_begins_with, prefix), d) + + +def groupby_prefix_and_trim(prefix, d): + kwargs_with_prefix, kwargs = group_dict_by_key( + partial(string_begins_with, prefix), d) + kwargs_without_prefix = dict( + map(lambda x: (x[0][len(prefix):], x[1]), + tuple(kwargs_with_prefix.items()))) + return kwargs_without_prefix, kwargs + + +# classes +class Scale(nn.Module): + + def __init__(self, value, fn): + super().__init__() + self.value = value + self.fn = fn + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.value, *rest) + + +class Rezero(nn.Module): + + def __init__(self, fn): + super().__init__() + self.fn 
= fn + self.g = nn.Parameter(torch.zeros(1)) + + def forward(self, x, **kwargs): + x, *rest = self.fn(x, **kwargs) + return (x * self.g, *rest) + + +class ScaleNorm(nn.Module): + + def __init__(self, dim, eps=1e-5): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(1)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class RMSNorm(nn.Module): + + def __init__(self, dim, eps=1e-8): + super().__init__() + self.scale = dim**-0.5 + self.eps = eps + self.g = nn.Parameter(torch.ones(dim)) + + def forward(self, x): + norm = torch.norm(x, dim=-1, keepdim=True) * self.scale + return x / norm.clamp(min=self.eps) * self.g + + +class Residual(nn.Module): + + def forward(self, x, residual): + return x + residual + + +class GRUGating(nn.Module): + + def __init__(self, dim): + super().__init__() + self.gru = nn.GRUCell(dim, dim) + + def forward(self, x, residual): + gated_output = self.gru( + rearrange(x, 'b n d -> (b n) d'), + rearrange(residual, 'b n d -> (b n) d')) + + return gated_output.reshape_as(x) + + +# feedforward + + +class GEGLU(nn.Module): + + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + + def __init__(self, dim, dim_out=None, mult=4, glu=False, dropout=0.): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = nn.Sequential(nn.Linear( + dim, inner_dim), nn.GELU()) if not glu else GEGLU(dim, inner_dim) + + self.net = nn.Sequential(project_in, nn.Dropout(dropout), + nn.Linear(inner_dim, dim_out)) + + def forward(self, x): + return self.net(x) + + +# attention. 
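+# Multi-head attention for the x-transformers port below. Besides plain scaled
+# dot-product attention it optionally supports causal masking, talking heads,
+# learned memory key/values and top-k sparse attention, and it tracks the
+# pre-/post-softmax attention maps (see the Intermediates tuple defined above).
+# A minimal self-attention call (shapes illustrative):
+#   Attention(dim=512, heads=8)(torch.randn(2, 77, 512))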
+class Attention(nn.Module): + + def __init__(self, + dim, + dim_head=DEFAULT_DIM_HEAD, + heads=8, + causal=False, + mask=None, + talking_heads=False, + sparse_topk=None, + use_entmax15=False, + num_mem_kv=0, + dropout=0., + on_attn=False): + super().__init__() + if use_entmax15: + raise NotImplementedError( + 'Check out entmax activation instead of softmax activation!') + self.scale = dim_head**-0.5 + self.heads = heads + self.causal = causal + self.mask = mask + + inner_dim = dim_head * heads + + self.to_q = nn.Linear(dim, inner_dim, bias=False) + self.to_k = nn.Linear(dim, inner_dim, bias=False) + self.to_v = nn.Linear(dim, inner_dim, bias=False) + self.dropout = nn.Dropout(dropout) + + # talking heads + self.talking_heads = talking_heads + if talking_heads: + self.pre_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + self.post_softmax_proj = nn.Parameter(torch.randn(heads, heads)) + + # explicit topk sparse attention + self.sparse_topk = sparse_topk + + # entmax + # self.attn_fn = entmax15 if use_entmax15 else F.softmax + self.attn_fn = F.softmax + + # add memory key / values + self.num_mem_kv = num_mem_kv + if num_mem_kv > 0: + self.mem_k = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + self.mem_v = nn.Parameter(torch.randn(heads, num_mem_kv, dim_head)) + + # attention on attention + self.attn_on_attn = on_attn + self.to_out = nn.Sequential(nn.Linear( + inner_dim, dim + * 2), nn.GLU()) if on_attn else nn.Linear(inner_dim, dim) + + def forward(self, + x, + context=None, + mask=None, + context_mask=None, + rel_pos=None, + sinusoidal_emb=None, + prev_attn=None, + mem=None): + b, n, _, h, talking_heads, device = *x.shape, self.heads, self.talking_heads, x.device + kv_input = default(context, x) + + q_input = x + k_input = kv_input + v_input = kv_input + + if exists(mem): + k_input = torch.cat((mem, k_input), dim=-2) + v_input = torch.cat((mem, v_input), dim=-2) + + if exists(sinusoidal_emb): + # in shortformer, the query would start at a position offset depending on the past cached memory + offset = k_input.shape[-2] - q_input.shape[-2] + q_input = q_input + sinusoidal_emb(q_input, offset=offset) + k_input = k_input + sinusoidal_emb(k_input) + + q = self.to_q(q_input) + k = self.to_k(k_input) + v = self.to_v(v_input) + + q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h=h), + (q, k, v)) + + input_mask = None + if any(map(exists, (mask, context_mask))): + q_mask = default(mask, lambda: torch.ones( + (b, n), device=device).bool()) + k_mask = q_mask if not exists(context) else context_mask + k_mask = default( + k_mask, lambda: torch.ones( + (b, k.shape[-2]), device=device).bool()) + q_mask = rearrange(q_mask, 'b i -> b () i ()') + k_mask = rearrange(k_mask, 'b j -> b () () j') + input_mask = q_mask * k_mask + + if self.num_mem_kv > 0: + mem_k, mem_v = map(lambda t: repeat(t, 'h n d -> b h n d', b=b), + (self.mem_k, self.mem_v)) + k = torch.cat((mem_k, k), dim=-2) + v = torch.cat((mem_v, v), dim=-2) + if exists(input_mask): + input_mask = F.pad( + input_mask, (self.num_mem_kv, 0), value=True) + + dots = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale + mask_value = max_neg_value(dots) + + if exists(prev_attn): + dots = dots + prev_attn + + pre_softmax_attn = dots + + if talking_heads: + dots = einsum('b h i j, h k -> b k i j', dots, + self.pre_softmax_proj).contiguous() + + if exists(rel_pos): + dots = rel_pos(dots) + + if exists(input_mask): + dots.masked_fill_(~input_mask, mask_value) + del input_mask + + if self.causal: + i, j = dots.shape[-2:] + r = 
torch.arange(i, device=device) + mask = rearrange(r, 'i -> () () i ()') < rearrange( + r, 'j -> () () () j') + mask = F.pad(mask, (j - i, 0), value=False) + dots.masked_fill_(mask, mask_value) + del mask + + if exists(self.sparse_topk) and self.sparse_topk < dots.shape[-1]: + top, _ = dots.topk(self.sparse_topk, dim=-1) + vk = top[..., -1].unsqueeze(-1).expand_as(dots) + mask = dots < vk + dots.masked_fill_(mask, mask_value) + del mask + + attn = self.attn_fn(dots, dim=-1) + post_softmax_attn = attn + + attn = self.dropout(attn) + + if talking_heads: + attn = einsum('b h i j, h k -> b k i j', attn, + self.post_softmax_proj).contiguous() + + out = einsum('b h i j, b h j d -> b h i d', attn, v) + out = rearrange(out, 'b h n d -> b n (h d)') + + intermediates = Intermediates( + pre_softmax_attn=pre_softmax_attn, + post_softmax_attn=post_softmax_attn) + + return self.to_out(out), intermediates + + +class AttentionLayers(nn.Module): + + def __init__(self, + dim, + depth, + heads=8, + causal=False, + cross_attend=False, + only_cross=False, + use_scalenorm=False, + use_rmsnorm=False, + use_rezero=False, + rel_pos_num_buckets=32, + rel_pos_max_distance=128, + position_infused_attn=False, + custom_layers=None, + sandwich_coef=None, + par_ratio=None, + residual_attn=False, + cross_residual_attn=False, + macaron=False, + pre_norm=True, + gate_residual=False, + **kwargs): + super().__init__() + ff_kwargs, kwargs = groupby_prefix_and_trim('ff_', kwargs) + attn_kwargs, _ = groupby_prefix_and_trim('attn_', kwargs) + + # dim_head = attn_kwargs.get('dim_head', DEFAULT_DIM_HEAD) + + self.dim = dim + self.depth = depth + self.layers = nn.ModuleList([]) + + self.has_pos_emb = position_infused_attn + self.pia_pos_emb = FixedPositionalEmbedding( + dim) if position_infused_attn else None + self.rotary_pos_emb = always(None) + + assert rel_pos_num_buckets <= rel_pos_max_distance, 'number of relative position buckets must be less than \ + the relative position max distance' + + self.rel_pos = None + + self.pre_norm = pre_norm + + self.residual_attn = residual_attn + self.cross_residual_attn = cross_residual_attn + + norm_class = ScaleNorm if use_scalenorm else nn.LayerNorm + norm_class = RMSNorm if use_rmsnorm else norm_class + norm_fn = partial(norm_class, dim) + + norm_fn = nn.Identity if use_rezero else norm_fn + branch_fn = Rezero if use_rezero else None + + if cross_attend and not only_cross: + default_block = ('a', 'c', 'f') + elif cross_attend and only_cross: + default_block = ('c', 'f') + else: + default_block = ('a', 'f') + + if macaron: + default_block = ('f', ) + default_block + + if exists(custom_layers): + layer_types = custom_layers + elif exists(par_ratio): + par_depth = depth * len(default_block) + assert 1 < par_ratio <= par_depth, 'par ratio out of range' + default_block = tuple(filter(not_equals('f'), default_block)) + par_attn = par_depth // par_ratio + depth_cut = par_depth * 2 // 3 # 2 / 3 attention layer cutoff suggested by PAR paper + par_width = (depth_cut + depth_cut // par_attn) // par_attn + assert len( + default_block + ) <= par_width, 'default block is too large for par_ratio' + par_block = default_block + ('f', ) * ( + par_width - len(default_block)) + par_head = par_block * par_attn + layer_types = par_head + ('f', ) * (par_depth - len(par_head)) + elif exists(sandwich_coef): + assert sandwich_coef > 0 and sandwich_coef <= depth, 'sandwich coefficient should be less than the depth' + layer_types = ('a', ) * sandwich_coef + default_block * ( + depth - sandwich_coef) + ('f', ) * 
sandwich_coef + else: + layer_types = default_block * depth + + self.layer_types = layer_types + self.num_attn_layers = len(list(filter(equals('a'), layer_types))) + + for layer_type in self.layer_types: + if layer_type == 'a': + layer = Attention( + dim, heads=heads, causal=causal, **attn_kwargs) + elif layer_type == 'c': + layer = Attention(dim, heads=heads, **attn_kwargs) + elif layer_type == 'f': + layer = FeedForward(dim, **ff_kwargs) + layer = layer if not macaron else Scale(0.5, layer) + else: + raise Exception(f'invalid layer type {layer_type}') + + if isinstance(layer, Attention) and exists(branch_fn): + layer = branch_fn(layer) + + if gate_residual: + residual_fn = GRUGating(dim) + else: + residual_fn = Residual() + + self.layers.append(nn.ModuleList([norm_fn(), layer, residual_fn])) + + def forward(self, + x, + context=None, + mask=None, + context_mask=None, + mems=None, + return_hiddens=False): + hiddens = [] + intermediates = [] + prev_attn = None + prev_cross_attn = None + + mems = mems.copy() if exists(mems) else [None] * self.num_attn_layers + + for ind, (layer_type, (norm, block, residual_fn)) in enumerate( + zip(self.layer_types, self.layers)): + is_last = ind == (len(self.layers) - 1) + + if layer_type == 'a': + hiddens.append(x) + layer_mem = mems.pop(0) + + residual = x + + if self.pre_norm: + x = norm(x) + + if layer_type == 'a': + out, inter = block( + x, + mask=mask, + sinusoidal_emb=self.pia_pos_emb, + rel_pos=self.rel_pos, + prev_attn=prev_attn, + mem=layer_mem) + elif layer_type == 'c': + out, inter = block( + x, + context=context, + mask=mask, + context_mask=context_mask, + prev_attn=prev_cross_attn) + elif layer_type == 'f': + out = block(x) + + x = residual_fn(out, residual) + + if layer_type in ('a', 'c'): + intermediates.append(inter) + + if layer_type == 'a' and self.residual_attn: + prev_attn = inter.pre_softmax_attn + elif layer_type == 'c' and self.cross_residual_attn: + prev_cross_attn = inter.pre_softmax_attn + + if not self.pre_norm and not is_last: + x = norm(x) + + if return_hiddens: + intermediates = LayerIntermediates( + hiddens=hiddens, attn_intermediates=intermediates) + + return x, intermediates + + return x + + +class Encoder(AttentionLayers): + + def __init__(self, **kwargs): + assert 'causal' not in kwargs, 'cannot set causality on encoder' + super().__init__(causal=False, **kwargs) + + +class TransformerWrapper(nn.Module): + + def __init__(self, + *, + num_tokens, + max_seq_len, + attn_layers, + emb_dim=None, + max_mem_len=0., + emb_dropout=0., + num_memory_tokens=None, + tie_embedding=False, + use_pos_emb=True): + super().__init__() + assert isinstance( + attn_layers, AttentionLayers + ), 'attention layers must be one of Encoder or Decoder' + + dim = attn_layers.dim + emb_dim = default(emb_dim, dim) + + self.max_seq_len = max_seq_len + self.max_mem_len = max_mem_len + self.num_tokens = num_tokens + + self.token_emb = nn.Embedding(num_tokens, emb_dim) + self.pos_emb = AbsolutePositionalEmbedding(emb_dim, max_seq_len) if ( + use_pos_emb and not attn_layers.has_pos_emb) else always(0) + self.emb_dropout = nn.Dropout(emb_dropout) + + self.project_emb = nn.Linear(emb_dim, + dim) if emb_dim != dim else nn.Identity() + self.attn_layers = attn_layers + self.norm = nn.LayerNorm(dim) + + self.init_() + + self.to_logits = nn.Linear( + dim, num_tokens + ) if not tie_embedding else lambda t: t @ self.token_emb.weight.t() + + # memory tokens (like [cls]) from Memory Transformers paper + num_memory_tokens = default(num_memory_tokens, 0) + 
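+ # num_memory_tokens defaults to 0; when positive, the learned memory tokens
+ # (akin to [CLS]) are prepended to every sequence in forward(), the attention
+ # mask is padded accordingly, and the tokens are stripped off again before
+ # the logits / embeddings are returned.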
self.num_memory_tokens = num_memory_tokens + if num_memory_tokens > 0: + self.memory_tokens = nn.Parameter( + torch.randn(num_memory_tokens, dim)) + + # let funnel encoder know number of memory tokens, if specified + if hasattr(attn_layers, 'num_memory_tokens'): + attn_layers.num_memory_tokens = num_memory_tokens + + def init_(self): + nn.init.normal_(self.token_emb.weight, std=0.02) + + def forward(self, + x, + return_embeddings=False, + mask=None, + return_mems=False, + return_attn=False, + mems=None, + **kwargs): + b, _, _, num_mem = *x.shape, x.device, self.num_memory_tokens + x = self.token_emb(x) + x += self.pos_emb(x) + x = self.emb_dropout(x) + + x = self.project_emb(x) + + if num_mem > 0: + mem = repeat(self.memory_tokens, 'n d -> b n d', b=b) + x = torch.cat((mem, x), dim=1) + + # auto-handle masking after appending memory tokens + if exists(mask): + mask = F.pad(mask, (num_mem, 0), value=True) + + x, intermediates = self.attn_layers( + x, mask=mask, mems=mems, return_hiddens=True, **kwargs) + x = self.norm(x) + + mem, x = x[:, :num_mem], x[:, num_mem:] + + out = self.to_logits(x) if not return_embeddings else x + + if return_mems: + hiddens = intermediates.hiddens + new_mems = list( + map(lambda pair: torch.cat(pair, dim=-2), zip( + mems, hiddens))) if exists(mems) else hiddens + new_mems = list( + map(lambda t: t[..., -self.max_mem_len:, :].detach(), + new_mems)) + return out, new_mems + + if return_attn: + attn_maps = list( + map(lambda t: t.post_softmax_attn, + intermediates.attn_intermediates)) + return out, attn_maps + + return out diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py new file mode 100644 index 000000000..954db9cd5 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/helpers.py @@ -0,0 +1,133 @@ +# https://github.com/eladrich/pixel2style2pixel + +from collections import namedtuple + +import torch +from torch.nn import (AdaptiveAvgPool2d, BatchNorm2d, Conv2d, MaxPool2d, + Module, PReLU, ReLU, Sequential, Sigmoid) + +# ArcFace implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) + + +class Flatten(Module): + + def forward(self, input): + return input.view(input.size(0), -1) + + +def l2_norm(input, axis=1): + norm = torch.norm(input, 2, axis, True) + output = torch.div(input, norm) + return output + + +class Bottleneck(namedtuple('Block', ['in_channel', 'depth', 'stride'])): + """ A named tuple describing a ResNet block. """ + + +def get_block(in_channel, depth, num_units, stride=2): + return [Bottleneck(in_channel, depth, stride) + ] + [Bottleneck(depth, depth, 1) for i in range(num_units - 1)] + + +def get_blocks(num_layers): + if num_layers == 50: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=4), + get_block(in_channel=128, depth=256, num_units=14), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 100: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=13), + get_block(in_channel=128, depth=256, num_units=30), + get_block(in_channel=256, depth=512, num_units=3) + ] + elif num_layers == 152: + blocks = [ + get_block(in_channel=64, depth=64, num_units=3), + get_block(in_channel=64, depth=128, num_units=8), + get_block(in_channel=128, depth=256, num_units=36), + get_block(in_channel=256, depth=512, num_units=3) + ] + else: + raise ValueError( + 'Invalid number of layers: {}. 
Must be one of [50, 100, 152]'. + format(num_layers)) + return blocks + + +class SEModule(Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.avg_pool = AdaptiveAvgPool2d(1) + self.fc1 = Conv2d( + channels, + channels // reduction, + kernel_size=1, + padding=0, + bias=False) + self.relu = ReLU(inplace=True) + self.fc2 = Conv2d( + channels // reduction, + channels, + kernel_size=1, + padding=0, + bias=False) + self.sigmoid = Sigmoid() + + def forward(self, x): + module_input = x + x = self.avg_pool(x) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class bottleneck_IR(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut + + +class bottleneck_IR_SE(Module): + + def __init__(self, in_channel, depth, stride): + super(bottleneck_IR_SE, self).__init__() + if in_channel == depth: + self.shortcut_layer = MaxPool2d(1, stride) + else: + self.shortcut_layer = Sequential( + Conv2d(in_channel, depth, (1, 1), stride, bias=False), + BatchNorm2d(depth)) + self.res_layer = Sequential( + BatchNorm2d(in_channel), + Conv2d(in_channel, depth, (3, 3), (1, 1), 1, bias=False), + PReLU(depth), Conv2d(depth, depth, (3, 3), stride, 1, bias=False), + BatchNorm2d(depth), SEModule(depth, 16)) + + def forward(self, x): + shortcut = self.shortcut_layer(x) + res = self.res_layer(x) + return res + shortcut diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py new file mode 100644 index 000000000..c6cb52bc7 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/id_loss.py @@ -0,0 +1,27 @@ +# https://github.com/eladrich/pixel2style2pixel +import torch +from torch import nn + +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.model_irse import Backbone + + +class IDFeatures(nn.Module): + + def __init__(self, model_path): + super(IDFeatures, self).__init__() + print('Loading ResNet ArcFace') + self.facenet = Backbone( + input_size=112, num_layers=50, drop_ratio=0.6, mode='ir_se') + self.facenet.load_state_dict( + torch.load(model_path, map_location='cpu')) + self.face_pool = torch.nn.AdaptiveAvgPool2d((112, 112)) + self.facenet.eval() + + def forward(self, x, crop=False): + # Not sure of the image range here + if crop: + x = torch.nn.functional.interpolate(x, (256, 256), mode='area') + x = x[:, :, 35:223, 32:220] + x = self.face_pool(x) + x_feats = self.facenet(x) + return x_feats diff --git a/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py new file mode 100644 index 000000000..f3d6deab3 --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/thirdp/psp/model_irse.py @@ -0,0 +1,96 @@ +# https://github.com/eladrich/pixel2style2pixel + +from torch.nn import (BatchNorm1d, BatchNorm2d, Conv2d, Dropout, Linear, + Module, PReLU, Sequential) + +from modelscope.models.cv.image_to_3d.ldm.thirdp.psp.helpers 
import ( + Flatten, bottleneck_IR, bottleneck_IR_SE, get_blocks, l2_norm) + +# Modified Backbone implementation from [TreB1eN](https://github.com/TreB1eN/InsightFace_Pytorch) + + +class Backbone(Module): + + def __init__(self, + input_size, + num_layers, + mode='ir', + drop_ratio=0.4, + affine=True): + super(Backbone, self).__init__() + assert input_size in [112, 224], 'input_size should be 112 or 224' + assert num_layers in [50, 100, + 152], 'num_layers should be 50, 100 or 152' + assert mode in ['ir', 'ir_se'], 'mode should be ir or ir_se' + blocks = get_blocks(num_layers) + if mode == 'ir': + unit_module = bottleneck_IR + elif mode == 'ir_se': + unit_module = bottleneck_IR_SE + self.input_layer = Sequential( + Conv2d(3, 64, (3, 3), 1, 1, bias=False), BatchNorm2d(64), + PReLU(64)) + if input_size == 112: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 7 * 7, 512), BatchNorm1d(512, affine=affine)) + else: + self.output_layer = Sequential( + BatchNorm2d(512), Dropout(drop_ratio), Flatten(), + Linear(512 * 14 * 14, 512), BatchNorm1d(512, affine=affine)) + + modules = [] + for block in blocks: + for bottleneck in block: + modules.append( + unit_module(bottleneck.in_channel, bottleneck.depth, + bottleneck.stride)) + self.body = Sequential(*modules) + + def forward(self, x): + x = self.input_layer(x) + x = self.body(x) + x = self.output_layer(x) + return l2_norm(x) + + +def IR_50(input_size): + """Constructs a ir-50 model.""" + model = Backbone( + input_size, num_layers=50, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_101(input_size): + """Constructs a ir-101 model.""" + model = Backbone( + input_size, num_layers=100, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_152(input_size): + """Constructs a ir-152 model.""" + model = Backbone( + input_size, num_layers=152, mode='ir', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_50(input_size): + """Constructs a ir_se-50 model.""" + model = Backbone( + input_size, num_layers=50, mode='ir_se', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_101(input_size): + """Constructs a ir_se-101 model.""" + model = Backbone( + input_size, num_layers=100, mode='ir_se', drop_ratio=0.4, affine=False) + return model + + +def IR_SE_152(input_size): + """Constructs a ir_se-152 model.""" + model = Backbone( + input_size, num_layers=152, mode='ir_se', drop_ratio=0.4, affine=False) + return model diff --git a/modelscope/models/cv/image_to_3d/ldm/util.py b/modelscope/models/cv/image_to_3d/ldm/util.py new file mode 100644 index 000000000..83ac20a3e --- /dev/null +++ b/modelscope/models/cv/image_to_3d/ldm/util.py @@ -0,0 +1,302 @@ +import importlib +import os +import time +from inspect import isfunction + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import PIL +import torch +import torchvision +from PIL import Image, ImageDraw, ImageFont +from torch import optim + + +def pil_rectangle_crop(im): + width, height = im.size # Get dimensions + + if width <= height: + left = 0 + right = width + top = (height - width) / 2 + bottom = (height + width) / 2 + else: + + top = 0 + bottom = height + left = (width - height) / 2 + bottom = (width + height) / 2 + + # Crop the center of the image + im = im.crop((left, top, right, bottom)) + return im + + +def add_margin(pil_img, color=0, size=256): + width, height = pil_img.size + result = Image.new(pil_img.mode, (size, size), color) + result.paste(pil_img, ((size - width) // 2, (size - height) // 2)) 
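+ # at this point `result` is a size x size canvas filled with `color`, with
+ # the input image pasted at the offsets above so that it sits centered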
+ return result + + +def create_carvekit_interface(): + from carvekit.api.high import HiInterface + # Check doc strings for more information + interface = HiInterface( + object_type='object', # Can be "object" or "hairs-like". + batch_size_seg=5, + batch_size_matting=1, + device='cuda' if torch.cuda.is_available() else 'cpu', + seg_mask_size=640, # Use 640 for Tracer B7 and 320 for U2Net + matting_mask_size=2048, + trimap_prob_threshold=231, + trimap_dilation=30, + trimap_erosion_iters=5, + fp16=False) + + return interface + + +def load_and_preprocess(interface, input_im): + ''' + :param input_im (PIL Image). + :return image (H, W, 3) array in [0, 1]. + ''' + # See https://github.com/Ir1d/image-background-remove-tool + image = input_im.convert('RGB') + + image_without_background = interface([image])[0] + image_without_background = np.array(image_without_background) + est_seg = image_without_background > 127 + image = np.array(image) + foreground = est_seg[:, :, -1].astype(np.bool_) + image[~foreground] = [255., 255., 255.] + x, y, w, h = cv2.boundingRect(foreground.astype(np.uint8)) + image = image[y:y + h, x:x + w, :] + image = PIL.Image.fromarray(np.array(image)) + + # resize image such that long edge is 512 + image.thumbnail([200, 200], Image.LANCZOS) + image = add_margin(image, (255, 255, 255), size=256) + image = np.array(image) + + return image + + +def log_txt_as_img(wh, xc, size=10): + # wh a tuple of (width, height) + # xc a list of captions to plot + b = len(xc) + txts = list() + for bi in range(b): + txt = Image.new('RGB', wh, color='white') + draw = ImageDraw.Draw(txt) + font = ImageFont.truetype('data/DejaVuSans.ttf', size=size) + nc = int(40 * (wh[0] / 256)) + lines = '\n'.join(xc[bi][start:start + nc] + for start in range(0, len(xc[bi]), nc)) + + try: + draw.text((0, 0), lines, fill='black', font=font) + except UnicodeEncodeError: + print('Cant encode string for logging. Skipping.') + + txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 + txts.append(txt) + txts = np.stack(txts) + txts = torch.tensor(txts) + return txts + + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + + +def isimage(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def mean_flat(tensor): + """ + https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print( + f'{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.' 
+ ) + return total_params + + +def instantiate_from_config(config): + if 'target' not in config: + if config == '__is_first_stage__': + return None + elif config == '__is_unconditional__': + return None + raise KeyError('Expected key `target` to instantiate.') + return get_obj_from_str(config['target'])(**config.get('params', dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit('.', 1) + print(module) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +class AdamWwithEMAandWings(optim.Optimizer): + # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 + def __init__( + self, # noqa + params, # noqa + lr=1.e-3, # noqa + betas=(0.9, 0.999), # noqa + eps=1.e-8, # noqa + weight_decay=1.e-2, # noqa + amsgrad=False, # noqa + ema_decay=0.9999, # ema decay to match previous code # noqa + ema_power=1., # noqa + param_names=()): # noqa + # TODO: check hyperparameters before using + """AdamW that saves EMA versions of the parameters.""" + if not 0.0 <= lr: + raise ValueError('Invalid learning rate: {}'.format(lr)) + if not 0.0 <= eps: + raise ValueError('Invalid epsilon value: {}'.format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError('Invalid beta parameter at index 0: {}'.format( + betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError('Invalid beta parameter at index 1: {}'.format( + betas[1])) + if not 0.0 <= weight_decay: + raise ValueError( + 'Invalid weight_decay value: {}'.format(weight_decay)) + if not 0.0 <= ema_decay <= 1.0: + raise ValueError('Invalid ema_decay value: {}'.format(ema_decay)) + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + amsgrad=amsgrad, + ema_decay=ema_decay, + ema_power=ema_power, + param_names=param_names) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + ema_params_with_grad = [] + # state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + ema_decay = group['ema_decay'] + ema_power = group['ema_power'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError( + 'AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. 
values + state['max_exp_avg_sq'] = torch.zeros_like( + p, memory_format=torch.preserve_format) + # Exponential moving average of parameter values + state['param_exp_avg'] = p.detach().float().clone() + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + ema_params_with_grad.append(state['param_exp_avg']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optim._functional.adamw( + params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=False) + + cur_ema_decay = min(ema_decay, 1 - state['step']**-ema_power) + for param, ema_param in zip(params_with_grad, + ema_params_with_grad): + ema_param.mul_(cur_ema_decay).add_( + param.float(), alpha=1 - cur_ema_decay) + + return loss diff --git a/modelscope/models/cv/image_try_on/generator.py b/modelscope/models/cv/image_try_on/generator.py index 47e2bc1a5..1b1552cc2 100644 --- a/modelscope/models/cv/image_try_on/generator.py +++ b/modelscope/models/cv/image_try_on/generator.py @@ -1,5 +1,5 @@ # The implementation here is modified based on spade, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/NVlabs/SPADE +# originally Apache 2.0 License and publicly available at https://github.com/NVlabs/SPADE import functools import os diff --git a/modelscope/models/cv/image_try_on/landmark.py b/modelscope/models/cv/image_try_on/landmark.py index f74416d54..489e59c30 100644 --- a/modelscope/models/cv/image_try_on/landmark.py +++ b/modelscope/models/cv/image_try_on/landmark.py @@ -1,5 +1,5 @@ # The implementation here is modified based on hrnet, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation +# originally Apache 2.0 License and publicly available at https://github.com/HRNet/HigherHRNet-Human-Pose-Estimation import logging import os diff --git a/modelscope/models/cv/image_try_on/warping.py b/modelscope/models/cv/image_try_on/warping.py index 6c9cf18cd..c0116e01b 100644 --- a/modelscope/models/cv/image_try_on/warping.py +++ b/modelscope/models/cv/image_try_on/warping.py @@ -1,5 +1,5 @@ # The implementation here is modified based on flow-style-vton, -# originally Apache 2.0 License and publicly avaialbe at https://github.com/SenHe/Flow-Style-VTON +# originally Apache 2.0 License and publicly available at https://github.com/SenHe/Flow-Style-VTON from collections import OrderedDict from math import sqrt diff --git a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py index 645821405..b97926884 100644 --- a/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py +++ b/modelscope/models/cv/referring_video_object_segmentation/utils/postprocessing.py @@ -109,7 +109,7 @@ def forward(self, outputs, videos_metadata, samples_shape_with_padding): 1) # remove the padding # resize the masks back to their original frames dataset size for evaluation: original_frames_size = video_metadata['original_frame_size'] - tuple_size = tuple(original_frames_size.cpu().numpy()) + tuple_size = tuple(original_frames_size.cpu()) video_pred_masks = F.interpolate( video_pred_masks.float(), size=tuple_size, 
mode='nearest') video_pred_masks = video_pred_masks.to(torch.uint8).cpu() diff --git a/modelscope/models/cv/self_supervised_depth_completion/__init__.py b/modelscope/models/cv/self_supervised_depth_completion/__init__.py new file mode 100644 index 000000000..e8e8e4cf7 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/__init__.py @@ -0,0 +1,21 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import TYPE_CHECKING + +from modelscope.utils.import_utils import LazyImportModule + +if TYPE_CHECKING: + from .self_supervised_depth_completion import SelfSupervisedDepthCompletion +else: + _import_structure = { + 'selfsuperviseddepthcompletion': ['SelfSupervisedDepthCompletion'], + } + + import sys + + sys.modules[__name__] = LazyImportModule( + __name__, + globals()['__file__'], + _import_structure, + module_spec=__spec__, + extra_objects={}, + ) diff --git a/modelscope/models/cv/self_supervised_depth_completion/criteria.py b/modelscope/models/cv/self_supervised_depth_completion/criteria.py new file mode 100644 index 000000000..d221ae58b --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/criteria.py @@ -0,0 +1,98 @@ +import torch +import torch.nn as nn + +from modelscope.utils.logger import get_logger + +logger = get_logger() + +loss_names = ['l1', 'l2'] + + +class MaskedMSELoss(nn.Module): + + def __init__(self): + super(MaskedMSELoss, self).__init__() + + def forward(self, pred, target): + assert pred.dim() == target.dim(), 'inconsistent dimensions' + valid_mask = (target > 0).detach() + diff = target - pred + diff = diff[valid_mask] + self.loss = (diff**2).mean() + return self.loss + + +class MaskedL1Loss(nn.Module): + + def __init__(self): + super(MaskedL1Loss, self).__init__() + + def forward(self, pred, target, weight=None): + assert pred.dim() == target.dim(), 'inconsistent dimensions' + valid_mask = (target > 0).detach() + diff = target - pred + diff = diff[valid_mask] + self.loss = diff.abs().mean() + return self.loss + + +class PhotometricLoss(nn.Module): + + def __init__(self): + super(PhotometricLoss, self).__init__() + + def forward(self, target, recon, mask=None): + + assert recon.dim( + ) == 4, 'expected recon dimension to be 4, but instead got {}.'.format( + recon.dim()) + assert target.dim( + ) == 4, 'expected target dimension to be 4, but instead got {}.'.format( + target.dim()) + assert recon.size() == target.size(), 'expected recon and target to have the same size, but got {} and {} '\ + .format(recon.size(), target.size()) + diff = (target - recon).abs() + diff = torch.sum(diff, 1) # sum along the color channel + + # compare only pixels that are not black + valid_mask = (torch.sum(recon, 1) > 0).float() * (torch.sum(target, 1) + > 0).float() + if mask is not None: + valid_mask = valid_mask * torch.squeeze(mask).float() + valid_mask = valid_mask.byte().detach() + if valid_mask.numel() > 0: + diff = diff[valid_mask] + if diff.nelement() > 0: + self.loss = diff.mean() + else: + logger.info( + 'warning: diff.nelement()==0 in PhotometricLoss (this is expected during early stage of training, \ + try larger batch size).') + self.loss = 0 + else: + logger.info('warning: 0 valid pixel in PhotometricLoss') + self.loss = 0 + return self.loss + + +class SmoothnessLoss(nn.Module): + + def __init__(self): + super(SmoothnessLoss, self).__init__() + + def forward(self, depth): + + def second_derivative(x): + assert x.dim( + ) == 4, 'expected 4-dimensional data, but instead got {}'.format( + x.dim()) + horizontal = 2 * x[:, :, 1:-1, 
1:-1] - x[:, :, + 1:-1, :-2] - x[:, :, 1:-1, + 2:] + vertical = 2 * x[:, :, 1:-1, 1:-1] - x[:, :, :-2, + 1:-1] - x[:, :, 2:, 1:-1] + der_2nd = horizontal.abs() + vertical.abs() + return der_2nd.mean() + + self.loss = second_derivative(depth) + return self.loss diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/__init__.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py new file mode 100644 index 000000000..937be3bfb --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/kitti_loader.py @@ -0,0 +1,344 @@ +import glob +import os +import os.path +from random import choice + +import cv2 +import numpy as np +import torch.utils.data as data +from numpy import linalg as LA +from PIL import Image + +from modelscope.models.cv.self_supervised_depth_completion.dataloaders import \ + transforms +from modelscope.models.cv.self_supervised_depth_completion.dataloaders.pose_estimator import \ + get_pose_pnp + +input_options = ['d', 'rgb', 'rgbd', 'g', 'gd'] + + +def load_calib(args): + """ + Temporarily hardcoding the calibration matrix using calib file from 2011_09_26 + """ + calib = open(os.path.join(args.data_folder, 'calib_cam_to_cam.txt'), 'r') + lines = calib.readlines() + P_rect_line = lines[25] + + Proj_str = P_rect_line.split(':')[1].split(' ')[1:] + Proj = np.reshape(np.array([float(p) for p in Proj_str]), + (3, 4)).astype(np.float32) + K = Proj[:3, :3] # camera matrix + + # note: we will take the center crop of the images during augmentation + # that changes the optical centers, but not focal lengths + K[0, 2] = K[ + 0, + 2] - 13 # from width = 1242 to 1216, with a 13-pixel cut on both sides + K[1, 2] = K[ + 1, + 2] - 11.5 # from width = 375 to 352, with a 11.5-pixel cut on both sides + return K + + +def get_paths_and_transform(split, args): + assert (args.use_d or args.use_rgb + or args.use_g), 'no proper input selected' + + if split == 'train': + transform = train_transform + glob_d = os.path.join( + args.data_folder, + 'data_depth_velodyne/train/*_sync/proj_depth/velodyne_raw/image_0[2,3]/*.png' + ) + glob_gt = os.path.join( + args.data_folder, + 'data_depth_annotated/train/*_sync/proj_depth/groundtruth/image_0[2,3]/*.png' + ) + + def get_rgb_paths(p): + ps = p.split('/') + pnew = '/'.join([args.data_folder] + ['data_rgb'] + ps[-6:-4] + + ps[-2:-1] + ['data'] + ps[-1:]) + return pnew + elif split == 'val': + if args.val == 'full': + transform = val_transform + glob_d = os.path.join( + args.data_folder, + 'data_depth_velodyne/val/*_sync/proj_depth/velodyne_raw/image_0[2,3]/*.png' + ) + glob_gt = os.path.join( + args.data_folder, + 'data_depth_annotated/val/*_sync/proj_depth/groundtruth/image_0[2,3]/*.png' + ) + + def get_rgb_paths(p): + ps = p.split('/') + pnew = '/'.join(ps[:-7] + ['data_rgb '] + ps[-6:-4] + ps[-2:-1] + + ['data'] + ps[-1:]) + return pnew + elif args.val == 'select': + transform = no_transform + glob_d = os.path.join( + args.data_folder, + 'depth_selection/val_selection_cropped/velodyne_raw/*.png') + glob_gt = os.path.join( + args.data_folder, + 'depth_selection/val_selection_cropped/groundtruth_depth/*.png' + ) + + def get_rgb_paths(p): + return p.replace('groundtruth_depth', 'image') + elif split == 'test_completion': + transform = no_transform + glob_d = 
os.path.join( + args.data_folder, + 'depth_selection/test_depth_completion_anonymous/velodyne_raw/*.png' + ) + glob_gt = None # "test_depth_completion_anonymous/" + glob_rgb = os.path.join( + args.data_folder, + 'depth_selection/test_depth_completion_anonymous/image/*.png') + elif split == 'test_prediction': + transform = no_transform + glob_d = None + glob_gt = None # "test_depth_completion_anonymous/" + glob_rgb = os.path.join( + args.data_folder, + 'depth_selection/test_depth_prediction_anonymous/image/*.png') + else: + raise ValueError('Unrecognized split ' + str(split)) + + if glob_gt is not None: + # train or val-full or val-select + paths_d = sorted(glob.glob(glob_d)) + paths_gt = sorted(glob.glob(glob_gt)) + paths_rgb = [get_rgb_paths(p) for p in paths_gt] + else: + # test only has d or rgb + paths_rgb = sorted(glob.glob(glob_rgb)) + paths_gt = [None] * len(paths_rgb) + if split == 'test_prediction': + paths_d = [None] * len( + paths_rgb) # test_prediction has no sparse depth + else: + paths_d = sorted(glob.glob(glob_d)) + + if len(paths_d) == 0 and len(paths_rgb) == 0 and len(paths_gt) == 0: + raise (RuntimeError('Found 0 images under {}'.format(glob_gt))) + if len(paths_d) == 0 and args.use_d: + raise (RuntimeError('Requested sparse depth but none was found')) + if len(paths_rgb) == 0 and args.use_rgb: + raise (RuntimeError('Requested rgb images but none was found')) + if len(paths_rgb) == 0 and args.use_g: + raise (RuntimeError('Requested gray images but no rgb was found')) + if len(paths_rgb) != len(paths_d) or len(paths_rgb) != len(paths_gt): + raise (RuntimeError('Produced different sizes for datasets')) + + paths = {'rgb': paths_rgb, 'd': paths_d, 'gt': paths_gt} + return paths, transform + + +def rgb_read(filename): + assert os.path.exists(filename), 'file not found: {}'.format(filename) + img_file = Image.open(filename) + # rgb_png = np.array(img_file, dtype=float) / 255.0 # scale pixels to the range [0,1] + rgb_png = np.array(img_file, dtype='uint8') # in the range [0,255] + img_file.close() + return rgb_png + + +def depth_read(filename): + # loads depth map D from png file + # and returns it as a numpy array, + # for details see readme.txt + assert os.path.exists(filename), 'file not found: {}'.format(filename) + img_file = Image.open(filename) + depth_png = np.array(img_file, dtype=int) + img_file.close() + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255, \ + 'np.max(depth_png)={}, path={}'.format(np.max(depth_png), filename) + + depth = depth_png.astype(float) / 256. + # depth[depth_png == 0] = -1. 
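+ # pixels with no depth measurement stay 0 (the commented-out line above would
+ # remap them to -1); the trailing axis turns the (H, W) map into (H, W, 1)
+ # so it is handled as a single-channel image downstream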
+ depth = np.expand_dims(depth, -1) + return depth + + +oheight, owidth = 352, 1216 + + +def drop_depth_measurements(depth, prob_keep): + mask = np.random.binomial(1, prob_keep, depth.shape) + depth *= mask + return depth + + +def train_transform(rgb, sparse, target, rgb_near, args): + # s = np.random.uniform(1.0, 1.5) # random scaling + # angle = np.random.uniform(-5.0, 5.0) # random rotation degrees + do_flip = np.random.uniform(0.0, 1.0) < 0.5 # random horizontal flip + + transform_geometric = transforms.Compose([ + # transforms.Rotate(angle), + # transforms.Resize(s), + transforms.BottomCrop((oheight, owidth)), + transforms.HorizontalFlip(do_flip) + ]) + if sparse is not None: + sparse = transform_geometric(sparse) + target = transform_geometric(target) + if rgb is not None: + brightness = np.random.uniform( + max(0, 1 - args.jitter), 1 + args.jitter) + contrast = np.random.uniform(max(0, 1 - args.jitter), 1 + args.jitter) + saturation = np.random.uniform( + max(0, 1 - args.jitter), 1 + args.jitter) + transform_rgb = transforms.Compose([ + transforms.ColorJitter(brightness, contrast, saturation, 0), + transform_geometric + ]) + rgb = transform_rgb(rgb) + if rgb_near is not None: + rgb_near = transform_rgb(rgb_near) + # sparse = drop_depth_measurements(sparse, 0.9) + + return rgb, sparse, target, rgb_near + + +def val_transform(rgb, sparse, target, rgb_near, args): + transform = transforms.Compose([ + transforms.BottomCrop((oheight, owidth)), + ]) + if rgb is not None: + rgb = transform(rgb) + if sparse is not None: + sparse = transform(sparse) + if target is not None: + target = transform(target) + if rgb_near is not None: + rgb_near = transform(rgb_near) + return rgb, sparse, target, rgb_near + + +def no_transform(rgb, sparse, target, rgb_near, args): + return rgb, sparse, target, rgb_near + + +to_tensor = transforms.ToTensor() + + +def to_float_tensor(x): + return to_tensor(x).float() + + +def handle_gray(rgb, args): + if rgb is None: + return None, None + if not args.use_g: + return rgb, None + else: + img = np.array(Image.fromarray(rgb).convert('L')) + img = np.expand_dims(img, -1) + if not args.use_rgb: + rgb_ret = None + else: + rgb_ret = rgb + return rgb_ret, img + + +def get_rgb_near(path, args): + assert path is not None, 'path is None' + + def extract_frame_id(filename): + head, tail = os.path.split(filename) + number_string = tail[0:tail.find('.')] + number = int(number_string) + return head, number + + def get_nearby_filename(filename, new_id): + head, _ = os.path.split(filename) + new_filename = os.path.join(head, '%010d.png' % new_id) + return new_filename + + head, number = extract_frame_id(path) + count = 0 + max_frame_diff = 3 + candidates = [ + i - max_frame_diff for i in range(max_frame_diff * 2 + 1) + if i - max_frame_diff != 0 + ] + while True: + random_offset = choice(candidates) + path_near = get_nearby_filename(path, number + random_offset) + if os.path.exists(path_near): + break + assert count < 20, 'cannot find a nearby frame in 20 trials for {}'.format( + path) + count += 1 + + return rgb_read(path_near) + + +class KittiDepth(data.Dataset): + """A data loader for the Kitti dataset + """ + + def __init__(self, split, args): + self.args = args + self.split = split + paths, transform = get_paths_and_transform(split, args) + self.paths = paths + self.transform = transform + self.K = load_calib(args) + self.threshold_translation = 0.1 + + def __getraw__(self, index): + rgb = rgb_read(self.paths['rgb'][index]) if \ + (self.paths['rgb'][index] is not None and 
(self.args.use_rgb or self.args.use_g)) else None + sparse = depth_read(self.paths['d'][index]) if \ + (self.paths['d'][index] is not None and self.args.use_d) else None + target = depth_read(self.paths['gt'][index]) if \ + self.paths['gt'][index] is not None else None + rgb_near = get_rgb_near(self.paths['rgb'][index], self.args) if \ + self.split == 'train' and self.args.use_pose else None + return rgb, sparse, target, rgb_near + + def __getitem__(self, index): + rgb, sparse, target, rgb_near = self.__getraw__(index) + rgb, sparse, target, rgb_near = self.transform(rgb, sparse, target, + rgb_near, self.args) + r_mat, t_vec = None, None + if self.split == 'train' and self.args.use_pose: + success, r_vec, t_vec = get_pose_pnp(rgb, rgb_near, sparse, self.K) + # discard if translation is too small + success = success and LA.norm(t_vec) > self.threshold_translation + if success: + r_mat, _ = cv2.Rodrigues(r_vec) + else: + # return the same image and no motion when PnP fails + rgb_near = rgb + t_vec = np.zeros((3, 1)) + r_mat = np.eye(3) + + rgb, gray = handle_gray(rgb, self.args) + candidates = { + 'rgb': rgb, + 'd': sparse, + 'gt': target, + 'g': gray, + 'r_mat': r_mat, + 't_vec': t_vec, + 'rgb_near': rgb_near + } + items = { + key: to_float_tensor(val) + for key, val in candidates.items() if val is not None + } + + return items + + def __len__(self): + return len(self.paths['gt']) diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py new file mode 100644 index 000000000..996725bf1 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/pose_estimator.py @@ -0,0 +1,102 @@ +import cv2 +import numpy as np + + +def rgb2gray(rgb): + return np.dot(rgb[..., :3], [0.299, 0.587, 0.114]) + + +def convert_2d_to_3d(u, v, z, K): + v0 = K[1][2] + u0 = K[0][2] + fy = K[1][1] + fx = K[0][0] + x = (u - u0) * z / fx + y = (v - v0) * z / fy + return (x, y, z) + + +def feature_match(img1, img2): + r''' Find features on both images and match them pairwise + ''' + max_n_features = 1000 + # max_n_features = 500 + use_flann = False # better not use flann + + detector = cv2.xfeatures2d.SIFT_create(max_n_features) + + # find the keypoints and descriptors with SIFT + kp1, des1 = detector.detectAndCompute(img1, None) + kp2, des2 = detector.detectAndCompute(img2, None) + if (des1 is None) or (des2 is None): + return [], [] + des1 = des1.astype(np.float32) + des2 = des2.astype(np.float32) + + if use_flann: + # FLANN parameters + FLANN_INDEX_KDTREE = 0 + index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5) + search_params = dict(checks=50) + flann = cv2.FlannBasedMatcher(index_params, search_params) + matches = flann.knnMatch(des1, des2, k=2) + else: + matcher = cv2.DescriptorMatcher().create('BruteForce') + matches = matcher.knnMatch(des1, des2, k=2) + + good = [] + pts1 = [] + pts2 = [] + # ratio test as per Lowe's paper + for i, (m, n) in enumerate(matches): + if m.distance < 0.8 * n.distance: + good.append(m) + pts2.append(kp2[m.trainIdx].pt) + pts1.append(kp1[m.queryIdx].pt) + + pts1 = np.int32(pts1) + pts2 = np.int32(pts2) + return pts1, pts2 + + +def get_pose_pnp(rgb_curr, rgb_near, depth_curr, K): + gray_curr = rgb2gray(rgb_curr).astype(np.uint8) + gray_near = rgb2gray(rgb_near).astype(np.uint8) + height, width = gray_curr.shape + + pts2d_curr, pts2d_near = feature_match(gray_curr, + gray_near) # feature matching + + # dilation of depth + kernel = 
np.ones((4, 4), np.uint8) + depth_curr_dilated = cv2.dilate(depth_curr, kernel) + + # extract 3d pts + pts3d_curr = [] + pts2d_near_filtered = [ + ] # keep only feature points with depth in the current frame + for i, pt2d in enumerate(pts2d_curr): + # print(pt2d) + u, v = pt2d[0], pt2d[1] + z = depth_curr_dilated[v, u] + if z > 0: + xyz_curr = convert_2d_to_3d(u, v, z, K) + pts3d_curr.append(xyz_curr) + pts2d_near_filtered.append(pts2d_near[i]) + + # the minimal number of points accepted by solvePnP is 4: + if len(pts3d_curr) >= 4 and len(pts2d_near_filtered) >= 4: + pts3d_curr = np.expand_dims( + np.array(pts3d_curr).astype(np.float32), axis=1) + pts2d_near_filtered = np.expand_dims( + np.array(pts2d_near_filtered).astype(np.float32), axis=1) + + # ransac + ret = cv2.solvePnPRansac( + pts3d_curr, pts2d_near_filtered, K, distCoeffs=None) + success = ret[0] + rotation_vector = ret[1] + translation_vector = ret[2] + return (success, rotation_vector, translation_vector) + else: + return (0, None, None) diff --git a/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py new file mode 100644 index 000000000..2d4cab3c6 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/dataloaders/transforms.py @@ -0,0 +1,617 @@ +from __future__ import division +import numbers +import types + +import numpy as np +import scipy.ndimage.interpolation as itpl +import skimage.transform +import torch +from PIL import Image, ImageEnhance + +try: + import accimage +except ImportError: + accimage = None + + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def _is_pil_image(img): + if accimage is not None: + return isinstance(img, (Image.Image, accimage.Image)) + else: + return isinstance(img, Image.Image) + + +def _is_tensor_image(img): + return torch.is_tensor(img) and img.ndimension() == 3 + + +def adjust_brightness(img, brightness_factor): + """Adjust brightness of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + PIL Image: Brightness adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Brightness(img) + img = enhancer.enhance(brightness_factor) + return img + + +def adjust_contrast(img, contrast_factor): + """Adjust contrast of an Image. + + Args: + img (PIL Image): PIL Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + PIL Image: Contrast adjusted image. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Contrast(img) + img = enhancer.enhance(contrast_factor) + return img + + +def adjust_saturation(img, saturation_factor): + """Adjust color saturation of an image. + + Args: + img (PIL Image): PIL Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + PIL Image: Saturation adjusted image. 
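+
+     Example (illustrative; assumes ``img`` is an already-opened PIL Image):
+         >>> img_desat = adjust_saturation(img, 0.5)  # blend halfway toward grayscale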
+ """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + enhancer = ImageEnhance.Color(img) + img = enhancer.enhance(saturation_factor) + return img + + +def adjust_hue(img, hue_factor): + """Adjust hue of an image. + + The image hue is adjusted by converting the image to HSV and + cyclically shifting the intensities in the hue channel (H). + The image is then converted back to original image mode. + + `hue_factor` is the amount of shift in H channel and must be in the + interval `[-0.5, 0.5]`. + + See https://en.wikipedia.org/wiki/Hue for more details on Hue. + + Args: + img (PIL Image): PIL Image to be adjusted. + hue_factor (float): How much to shift the hue channel. Should be in + [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in + HSV space in positive and negative direction respectively. + 0 means no shift. Therefore, both -0.5 and 0.5 will give an image + with complementary colors while 0 gives the original image. + + Returns: + PIL Image: Hue adjusted image. + """ + if not (-0.5 <= hue_factor <= 0.5): + raise ValueError( + 'hue_factor is not in [-0.5, 0.5]. Got {}'.format(hue_factor)) + + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + input_mode = img.mode + if input_mode in {'L', '1', 'I', 'F'}: + return img + + h, s, v = img.convert('HSV').split() + + np_h = np.array(h, dtype=np.uint8) + # uint8 addition take cares of rotation across boundaries + with np.errstate(over='ignore'): + np_h += np.uint8(hue_factor * 255) + h = Image.fromarray(np_h, 'L') + + img = Image.merge('HSV', (h, s, v)).convert(input_mode) + return img + + +def adjust_gamma(img, gamma, gain=1): + """Perform gamma correction on an image. + + Also known as Power Law Transform. Intensities in RGB mode are adjusted + based on the following equation: + + I_out = 255 * gain * ((I_in / 255) ** gamma) + + See https://en.wikipedia.org/wiki/Gamma_correction for more details. + + Args: + img (PIL Image): PIL Image to be adjusted. + gamma (float): Non negative real number. gamma larger than 1 make the + shadows darker, while gamma smaller than 1 make dark regions + lighter. + gain (float): The constant multiplier. + """ + if not _is_pil_image(img): + raise TypeError('img should be PIL Image. Got {}'.format(type(img))) + + if gamma < 0: + raise ValueError('Gamma should be a non-negative real number') + + input_mode = img.mode + img = img.convert('RGB') + + np_img = np.array(img, dtype=np.float32) + np_img = 255 * gain * ((np_img / 255)**gamma) + np_img = np.uint8(np.clip(np_img, 0, 255)) + + img = Image.fromarray(np_img, 'RGB').convert(input_mode) + return img + + +class Compose(object): + """Composes several transforms together. + + Args: + transforms (list of ``Transform`` objects): list of transforms to compose. + + Example: + >>> transforms.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img): + for t in self.transforms: + img = t(img) + return img + + +class ToTensor(object): + """Convert a ``numpy.ndarray`` to tensor. + + Converts a numpy.ndarray (H x W x C) to a torch.FloatTensor of shape (C x H x W). + """ + + def __call__(self, img): + """Convert a ``numpy.ndarray`` to tensor. + + Args: + img (numpy.ndarray): Image to be converted to tensor. + + Returns: + Tensor: Converted image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. 
Got {}'.format(type(img))) + + if isinstance(img, np.ndarray): + # handle numpy array + if img.ndim == 3: + img = torch.from_numpy(img.transpose((2, 0, 1)).copy()) + elif img.ndim == 2: + img = torch.from_numpy(img.copy()) + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'. + format(img.ndim)) + + return img + + +class NormalizeNumpyArray(object): + """Normalize a ``numpy.ndarray`` with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(M1,..,Mn)`` for ``n`` channels, this transform + will normalize each channel of the input ``numpy.ndarray`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, img): + """ + Args: + img (numpy.ndarray): Image of size (H, W, C) to be normalized. + + Returns: + Tensor: Normalized image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + # TODO: make efficient + # print(img.shape) + for i in range(3): + img[:, :, i] = (img[:, :, i] - self.mean[i]) / self.std[i] + return img + + +class NormalizeTensor(object): + """Normalize an tensor image with mean and standard deviation. + Given mean: ``(M1,...,Mn)`` and std: ``(M1,..,Mn)`` for ``n`` channels, this transform + will normalize each channel of the input ``torch.*Tensor`` i.e. + ``input[channel] = (input[channel] - mean[channel]) / std[channel]`` + + Args: + mean (sequence): Sequence of means for each channel. + std (sequence): Sequence of standard deviations for each channel. + """ + + def __init__(self, mean, std): + self.mean = mean + self.std = std + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Tensor image of size (C, H, W) to be normalized. + + Returns: + Tensor: Normalized Tensor image. + """ + if not _is_tensor_image(tensor): + raise TypeError('tensor is not a torch image.') + # TODO: make efficient + for t, m, s in zip(tensor, self.mean, self.std): + t.sub_(m).div_(s) + return tensor + + +class Rotate(object): + """Rotates the given ``numpy.ndarray``. + + Args: + angle (float): The rotation angle in degrees. + """ + + def __init__(self, angle): + self.angle = angle + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be rotated. + + Returns: + img (numpy.ndarray (C x H x W)): Rotated image. + """ + + # order=0 means nearest-neighbor type interpolation + return skimage.transform.rotate(img, self.angle, resize=False, order=0) + + +class Resize(object): + """Resize the the given ``numpy.ndarray`` to the given size. + Args: + size (sequence or int): Desired output size. If size is a sequence like + (h, w), output size will be matched to this. If size is an int, + smaller edge of the image will be matched to this number. + i.e, if height > width, then image will be rescaled to + (size * height / width, size) + interpolation (int, optional): Desired interpolation. Default is + ``PIL.Image.BILINEAR`` + """ + + def __init__(self, size, interpolation='nearest'): + assert isinstance(size, float) + self.size = size + self.interpolation = interpolation + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be scaled. + Returns: + img (numpy.ndarray (C x H x W)): Rescaled image. 
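+
+         Example (illustrative; ``img`` is a 2-D or 3-D ndarray):
+             >>> img_half = Resize(0.5)(img)  # nearest-neighbour rescale by a factor of 0.5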
+ """ + if img.ndim == 3: + return skimage.transform.rescale(img, self.size, order=0) + elif img.ndim == 2: + return skimage.transform.rescale(img, self.size, order=0) + else: + RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class CenterCrop(object): + """Crops the given ``numpy.ndarray`` at the center. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for center crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for center crop. + """ + h = img.shape[0] + w = img.shape[1] + th, tw = output_size + i = int(round((h - th) / 2.)) + j = int(round((w - tw) / 2.)) + + # # randomized cropping + # i = np.random.randint(i-3, i+4) + # j = np.random.randint(j-3, j+4) + + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + i, j, h, w = self.get_params(img, self.size) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[i:i + h, j:j + w, :] + elif img.ndim == 2: + return img[i:i + h, j:j + w] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class BottomCrop(object): + """Crops the given ``numpy.ndarray`` at the bottom. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, size): + if isinstance(size, numbers.Number): + self.size = (int(size), int(size)) + else: + self.size = size + + @staticmethod + def get_params(img, output_size): + """Get parameters for ``crop`` for bottom crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for bottom crop. + """ + h = img.shape[0] + w = img.shape[1] + th, tw = output_size + i = h - th + j = int(round((w - tw) / 2.)) + + # randomized left and right cropping + # i = np.random.randint(i-3, i+4) + # j = np.random.randint(j-1, j+1) + + return i, j, th, tw + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + i, j, h, w = self.get_params(img, self.size) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[i:i + h, j:j + w, :] + elif img.ndim == 2: + return img[i:i + h, j:j + w] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. 
Got {}'.format( + img.ndim)) + + +class Crop(object): + """Crops the given ``numpy.ndarray`` at the center. + + Args: + size (sequence or int): Desired output size of the crop. If size is an + int instead of sequence like (h, w), a square crop (size, size) is + made. + """ + + def __init__(self, crop): + self.crop = crop + + @staticmethod + def get_params(img, crop): + """Get parameters for ``crop`` for center crop. + + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + output_size (tuple): Expected output size of the crop. + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for center crop. + """ + x_l, x_r, y_b, y_t = crop + h = img.shape[0] + w = img.shape[1] + assert x_l >= 0 and x_l < w + assert x_r >= 0 and x_r < w + assert y_b >= 0 and y_b < h + assert y_t >= 0 and y_t < h + assert x_l < x_r and y_b < y_t + + return x_l, x_r, y_b, y_t + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be cropped. + + Returns: + img (numpy.ndarray (C x H x W)): Cropped image. + """ + x_l, x_r, y_b, y_t = self.get_params(img, self.crop) + """ + i: Upper pixel coordinate. + j: Left pixel coordinate. + h: Height of the cropped image. + w: Width of the cropped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + if img.ndim == 3: + return img[y_b:y_t, x_l:x_r, :] + elif img.ndim == 2: + return img[y_b:y_t, x_l:x_r] + else: + raise RuntimeError( + 'img should be ndarray with 2 or 3 dimensions. Got {}'.format( + img.ndim)) + + +class Lambda(object): + """Apply a user-defined lambda as a transform. + + Args: + lambd (function): Lambda/function to be used for transform. + """ + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img): + return self.lambd(img) + + +class HorizontalFlip(object): + """Horizontally flip the given ``numpy.ndarray``. + + Args: + do_flip (boolean): whether or not do horizontal flip. + + """ + + def __init__(self, do_flip): + self.do_flip = do_flip + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Image to be flipped. + + Returns: + img (numpy.ndarray (C x H x W)): flipped image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + + if self.do_flip: + return np.fliplr(img) + else: + return img + + +class ColorJitter(object): + """Randomly change the brightness, contrast and saturation of an image. + + Args: + brightness (float): How much to jitter brightness. brightness_factor + is chosen uniformly from [max(0, 1 - brightness), 1 + brightness]. + contrast (float): How much to jitter contrast. contrast_factor + is chosen uniformly from [max(0, 1 - contrast), 1 + contrast]. + saturation (float): How much to jitter saturation. saturation_factor + is chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. + hue(float): How much to jitter hue. hue_factor is chosen uniformly from + [-hue, hue]. Should be >=0 and <= 0.5. 
+ """ + + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): + transforms = [] + transforms.append( + Lambda(lambda img: adjust_brightness(img, brightness))) + transforms.append(Lambda(lambda img: adjust_contrast(img, contrast))) + transforms.append( + Lambda(lambda img: adjust_saturation(img, saturation))) + transforms.append(Lambda(lambda img: adjust_hue(img, hue))) + np.random.shuffle(transforms) + self.transform = Compose(transforms) + + def __call__(self, img): + """ + Args: + img (numpy.ndarray (C x H x W)): Input image. + + Returns: + img (numpy.ndarray (C x H x W)): Color jittered image. + """ + if not (_is_numpy_image(img)): + raise TypeError('img should be ndarray. Got {}'.format(type(img))) + + pil = Image.fromarray(img) + return np.array(self.transform(pil)) diff --git a/modelscope/models/cv/self_supervised_depth_completion/helper.py b/modelscope/models/cv/self_supervised_depth_completion/helper.py new file mode 100644 index 000000000..5a9069bdc --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/helper.py @@ -0,0 +1,269 @@ +import csv +import os +import shutil +import time + +import torch + +from modelscope.models.cv.self_supervised_depth_completion import vis_utils +from modelscope.models.cv.self_supervised_depth_completion.metrics import \ + Result + +fieldnames = [ + 'epoch', 'rmse', 'photo', 'mae', 'irmse', 'imae', 'mse', 'absrel', 'lg10', + 'silog', 'squared_rel', 'delta1', 'delta2', 'delta3', 'data_time', + 'gpu_time' +] + + +class logger: + + def __init__(self, args, prepare=True): + self.args = args + output_directory = get_folder_name(args) + self.output_directory = output_directory + self.best_result = Result() + self.best_result.set_to_worst() + + if not prepare: + return + if not os.path.exists(output_directory): + os.makedirs(output_directory) + self.train_csv = os.path.join(output_directory, 'train.csv') + self.val_csv = os.path.join(output_directory, 'val.csv') + self.best_txt = os.path.join(output_directory, 'best.txt') + + # backup the source code + if args.resume == '': + print('=> creating source code backup ...') + backup_directory = os.path.join(output_directory, 'code_backup') + self.backup_directory = backup_directory + # backup_source_code(backup_directory) + # create new csv files with only header + with open(self.train_csv, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + with open(self.val_csv, 'w') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writeheader() + print('=> finished creating source code backup.') + + def conditional_print(self, split, i, epoch, lr, n_set, blk_avg_meter, + avg_meter): + if (i + 1) % self.args.print_freq == 0: + avg = avg_meter.average() + blk_avg = blk_avg_meter.average() + print('=> output: {}'.format(self.output_directory)) + print( + '{split} Epoch: {0} [{1}/{2}]\tlr={lr} ' + 't_Data={blk_avg.data_time:.3f}({average.data_time:.3f}) ' + 't_GPU={blk_avg.gpu_time:.3f}({average.gpu_time:.3f})\n\t' + 'RMSE={blk_avg.rmse:.2f}({average.rmse:.2f}) ' + 'MAE={blk_avg.mae:.2f}({average.mae:.2f}) ' + 'iRMSE={blk_avg.irmse:.2f}({average.irmse:.2f}) ' + 'iMAE={blk_avg.imae:.2f}({average.imae:.2f})\n\t' + 'silog={blk_avg.silog:.2f}({average.silog:.2f}) ' + 'squared_rel={blk_avg.squared_rel:.2f}({average.squared_rel:.2f}) ' + 'Delta1={blk_avg.delta1:.3f}({average.delta1:.3f}) ' + 'REL={blk_avg.absrel:.3f}({average.absrel:.3f})\n\t' + 'Lg10={blk_avg.lg10:.3f}({average.lg10:.3f}) ' + 
'Photometric={blk_avg.photometric:.3f}({average.photometric:.3f}) ' + .format( + epoch, + i + 1, + n_set, + lr=lr, + blk_avg=blk_avg, + average=avg, + split=split.capitalize())) + blk_avg_meter.reset() + + def conditional_save_info(self, split, average_meter, epoch): + avg = average_meter.average() + if split == 'train': + csvfile_name = self.train_csv + elif split == 'val': + csvfile_name = self.val_csv + elif split == 'eval': + eval_filename = os.path.join(self.output_directory, 'eval.txt') + self.save_single_txt(eval_filename, avg, epoch) + return avg + elif 'test' in split: + return avg + else: + raise ValueError('wrong split provided to logger') + with open(csvfile_name, 'a') as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fieldnames) + writer.writerow({ + 'epoch': epoch, + 'rmse': avg.rmse, + 'photo': avg.photometric, + 'mae': avg.mae, + 'irmse': avg.irmse, + 'imae': avg.imae, + 'mse': avg.mse, + 'silog': avg.silog, + 'squared_rel': avg.squared_rel, + 'absrel': avg.absrel, + 'lg10': avg.lg10, + 'delta1': avg.delta1, + 'delta2': avg.delta2, + 'delta3': avg.delta3, + 'gpu_time': avg.gpu_time, + 'data_time': avg.data_time + }) + return avg + + def save_single_txt(self, filename, result, epoch): + with open(filename, 'w') as txtfile: + txtfile.write( + ('rank_metric={}\n' + 'epoch={}\n' + 'rmse={:.3f}\n' + + 'mae={:.3f}\n' + 'silog={:.3f}\n' + 'squared_rel={:.3f}\n' + + 'irmse={:.3f}\n' + 'imae={:.3f}\n' + 'mse={:.3f}\n' + + 'absrel={:.3f}\n' + 'lg10={:.3f}\n' + + 'delta1={:.3f}\n' + 't_gpu={:.4f}').format( + self.args.rank_metric, epoch, result.rmse, result.mae, + result.silog, result.squared_rel, result.irmse, + result.imae, result.mse, result.absrel, result.lg10, + result.delta1, result.gpu_time)) + + def save_best_txt(self, result, epoch): + self.save_single_txt(self.best_txt, result, epoch) + + def _get_img_comparison_name(self, mode, epoch, is_best=False): + if mode == 'eval': + return self.output_directory + '/comparison_eval.png' + if mode == 'val': + if is_best: + return self.output_directory + '/comparison_best.png' + else: + return self.output_directory + '/comparison_' + str( + epoch) + '.png' + + def conditional_save_img_comparison(self, mode, i, ele, pred, epoch): + # save 8 images for visualization + if mode == 'val' or mode == 'eval': + skip = 100 + if i == 0: + self.img_merge = vis_utils.merge_into_row(ele, pred) + elif i % skip == 0 and i < 8 * skip: + row = vis_utils.merge_into_row(ele, pred) + self.img_merge = vis_utils.add_row(self.img_merge, row) + elif i == 8 * skip: + filename = self._get_img_comparison_name(mode, epoch) + vis_utils.save_image(self.img_merge, filename) + return self.img_merge + + def save_img_comparison_as_best(self, mode, epoch): + if mode == 'val': + filename = self._get_img_comparison_name(mode, epoch, is_best=True) + vis_utils.save_image(self.img_merge, filename) + + def get_ranking_error(self, result): + return getattr(result, self.args.rank_metric) + + def rank_conditional_save_best(self, mode, result, epoch): + error = self.get_ranking_error(result) + best_error = self.get_ranking_error(self.best_result) + is_best = error < best_error + if is_best and mode == 'val': + self.old_best_result = self.best_result + self.best_result = result + self.save_best_txt(result, epoch) + return is_best + + def conditional_save_pred(self, mode, i, pred, epoch): + if ('test' in mode or mode == 'eval') and self.args.save_pred: + + # save images for visualization/ testing + image_folder = os.path.join(self.output_directory, + mode + '_output') + if 
not os.path.exists(image_folder): + os.makedirs(image_folder) + img = torch.squeeze(pred.data.cpu()).numpy() + filename = os.path.join(image_folder, '{0:010d}.png'.format(i)) + vis_utils.save_depth_as_uint16png(img, filename) + + def conditional_summarize(self, mode, avg, is_best): + print('\n*\nSummary of ', mode, 'round') + print('' + 'RMSE={average.rmse:.3f}\n' + 'MAE={average.mae:.3f}\n' + 'Photo={average.photometric:.3f}\n' + 'iRMSE={average.irmse:.3f}\n' + 'iMAE={average.imae:.3f}\n' + 'squared_rel={average.squared_rel}\n' + 'silog={average.silog}\n' + 'Delta1={average.delta1:.3f}\n' + 'REL={average.absrel:.3f}\n' + 'Lg10={average.lg10:.3f}\n' + 't_GPU={time:.3f}'.format(average=avg, time=avg.gpu_time)) + if is_best and mode == 'val': + print('New best model by %s (was %.3f)' % + (self.args.rank_metric, + self.get_ranking_error(self.old_best_result))) + elif mode == 'val': + print('(best %s is %.3f)' % + (self.args.rank_metric, + self.get_ranking_error(self.best_result))) + print('*\n') + + +ignore_hidden = shutil.ignore_patterns('.', '..', '.git*', '*pycache*', + '*build', '*.fuse*', '*_drive_*') + + +def backup_source_code(backup_directory): + if os.path.exists(backup_directory): + shutil.rmtree(backup_directory) + shutil.copytree('.', backup_directory, ignore=ignore_hidden) + + +def adjust_learning_rate(lr_init, optimizer, epoch): + """Sets the learning rate to the initial LR decayed by 10 every 5 epochs""" + lr = lr_init * (0.1**(epoch // 5)) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + return lr + + +def save_checkpoint(state, is_best, epoch, output_directory): + checkpoint_filename = os.path.join(output_directory, + 'checkpoint-' + str(epoch) + '.pth.tar') + torch.save(state, checkpoint_filename) + if is_best: + best_filename = os.path.join(output_directory, 'model_best.pth.tar') + shutil.copyfile(checkpoint_filename, best_filename) + if epoch > 0: + prev_checkpoint_filename = os.path.join( + output_directory, 'checkpoint-' + str(epoch - 1) + '.pth.tar') + if os.path.exists(prev_checkpoint_filename): + os.remove(prev_checkpoint_filename) + + +def get_folder_name(args): + # current_time = time.strftime('%Y-%m-%d@%H-%M') + # if args.use_pose: + # prefix = 'mode={}.w1={}.w2={}.'.format(args.train_mode, args.w1, + # args.w2) + # else: + # prefix = 'mode={}.'.format(args.train_mode) + # return os.path.join(args.result, + # prefix + 'input={}.resnet{}.criterion={}.lr={}.bs={}.wd={}.pretrained={}.jitter={}.time={}'. 
+ # format(args.input, args.layers, args.criterion, \ + # args.lr, args.batch_size, args.weight_decay, \ + # args.pretrained, args.jitter, current_time + # )) + return os.path.join(args.result, 'test') + + +avgpool = torch.nn.AvgPool2d(kernel_size=2, stride=2).cuda() + + +def multiscale(img): + img1 = avgpool(img) + img2 = avgpool(img1) + img3 = avgpool(img2) + img4 = avgpool(img3) + img5 = avgpool(img4) + return img5, img4, img3, img2, img1 diff --git a/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py b/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py new file mode 100644 index 000000000..08963fc9c --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/inverse_warp.py @@ -0,0 +1,141 @@ +import torch +import torch.nn.functional as F + +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +class Intrinsics: + """Intrinsics""" + + def __init__(self, width, height, fu, fv, cu=0, cv=0): + self.height, self.width = height, width + self.fu, self.fv = fu, fv # fu, fv: focal length along the horizontal and vertical axes + + # cu, cv: optical center along the horizontal and vertical axes + self.cu = cu if cu > 0 else (width - 1) / 2.0 + self.cv = cv if cv > 0 else (height - 1) / 2.0 + + # U, V represent the homogeneous horizontal and vertical coordinates in the pixel space + self.U = torch.arange(start=0, end=width).expand(height, width).float() + self.V = torch.arange( + start=0, end=height).expand(width, height).t().float() + + # X_cam, Y_cam represent the homogeneous x, y coordinates (assuming depth z=1) in the camera coordinate system + self.X_cam = (self.U - self.cu) / self.fu + self.Y_cam = (self.V - self.cv) / self.fv + + self.is_cuda = False + + def cuda(self): + self.X_cam.data = self.X_cam.data.cuda() + self.Y_cam.data = self.Y_cam.data.cuda() + self.is_cuda = True + return self + + def scale(self, height, width): + # return a new set of corresponding intrinsic parameters for the scaled image + ratio_u = float(width) / self.width + ratio_v = float(height) / self.height + fu = ratio_u * self.fu + fv = ratio_v * self.fv + cu = ratio_u * self.cu + cv = ratio_v * self.cv + new_intrinsics = Intrinsics(width, height, fu, fv, cu, cv) + if self.is_cuda: + new_intrinsics.cuda() + return new_intrinsics + + def __print__(self): + logger.info( + 'size=({},{})\nfocal length=({},{})\noptical center=({},{})'. 
+ format(self.height, self.width, self.fv, self.fu, self.cv, + self.cu)) + + +def image_to_pointcloud(depth, intrinsics): + assert depth.dim() == 4 + assert depth.size(1) == 1 + + X = depth * intrinsics.X_cam + Y = depth * intrinsics.Y_cam + return torch.cat((X, Y, depth), dim=1) + + +def pointcloud_to_image(pointcloud, intrinsics): + assert pointcloud.dim() == 4 + + batch_size = pointcloud.size(0) + X = pointcloud[:, 0, :, :] # .view(batch_size, -1) + Y = pointcloud[:, 1, :, :] # .view(batch_size, -1) + Z = pointcloud[:, 2, :, :].clamp(min=1e-3) # .view(batch_size, -1) + + # compute pixel coordinates + U_proj = intrinsics.fu * X / Z + intrinsics.cu # horizontal pixel coordinate + V_proj = intrinsics.fv * Y / Z + intrinsics.cv # vertical pixel coordinate + + # normalization to [-1, 1], required by torch.nn.functional.grid_sample + w = intrinsics.width + h = intrinsics.height + U_proj_normalized = (2 * U_proj / (w - 1) - 1).view(batch_size, -1) + V_proj_normalized = (2 * V_proj / (h - 1) - 1).view(batch_size, -1) + + # This was important since PyTorch didn't do as it claimed for points out of boundary + # See https://github.com/ClementPinard/SfmLearner-Pytorch/blob/master/inverse_warp.py + # Might not be necessary any more + U_proj_mask = ((U_proj_normalized > 1) + (U_proj_normalized < -1)).detach() + U_proj_normalized[U_proj_mask] = 2 + V_proj_mask = ((V_proj_normalized > 1) + (V_proj_normalized < -1)).detach() + V_proj_normalized[V_proj_mask] = 2 + + pixel_coords = torch.stack([U_proj_normalized, V_proj_normalized], + dim=2) # [B, H*W, 2] + return pixel_coords.view(batch_size, intrinsics.height, intrinsics.width, + 2) + + +def batch_multiply(batch_scalar, batch_matrix): + # input: batch_scalar of size b, batch_matrix of size b * 3 * 3 + # output: batch_matrix of size b * 3 * 3 + batch_size = batch_scalar.size(0) + output = batch_matrix.clone() + for i in range(batch_size): + output[i] = batch_scalar[i] * batch_matrix[i] + return output + + +def transform_curr_to_near(pointcloud_curr, r_mat, t_vec, intrinsics): + # translation and rotmat represent the transformation from tgt pose to src pose + batch_size = pointcloud_curr.size(0) + XYZ_ = torch.bmm(r_mat, pointcloud_curr.view(batch_size, 3, -1)) + + X = (XYZ_[:, 0, :] + t_vec[:, 0].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + Y = (XYZ_[:, 1, :] + t_vec[:, 1].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + Z = (XYZ_[:, 2, :] + t_vec[:, 2].unsqueeze(1)).view( + -1, 1, intrinsics.height, intrinsics.width) + + pointcloud_near = torch.cat((X, Y, Z), dim=1) + + return pointcloud_near + + +def homography_from(rgb_near, depth_curr, r_mat, t_vec, intrinsics): + # inverse warp the RGB image from the nearby frame to the current frame + + # to ensure dimension consistency + r_mat = r_mat.view(-1, 3, 3) + t_vec = t_vec.view(-1, 3) + + # compute source pixel coordinate + pointcloud_curr = image_to_pointcloud(depth_curr, intrinsics) + pointcloud_near = transform_curr_to_near(pointcloud_curr, r_mat, t_vec, + intrinsics) + pixel_coords_near = pointcloud_to_image(pointcloud_near, intrinsics) + + # the warping + warped = F.grid_sample(rgb_near, pixel_coords_near) + + return warped diff --git a/modelscope/models/cv/self_supervised_depth_completion/metrics.py b/modelscope/models/cv/self_supervised_depth_completion/metrics.py new file mode 100644 index 000000000..58bb9d5f2 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/metrics.py @@ -0,0 +1,181 @@ +import math + +import numpy as np +import 
torch + +lg_e_10 = math.log(10) + + +def log10(x): + """Convert a new tensor with the base-10 logarithm of the elements of x. """ + return torch.log(x) / lg_e_10 + + +class Result(object): + """Result""" + + def __init__(self): + self.irmse = 0 + self.imae = 0 + self.mse = 0 + self.rmse = 0 + self.mae = 0 + self.absrel = 0 + self.squared_rel = 0 + self.lg10 = 0 + self.delta1 = 0 + self.delta2 = 0 + self.delta3 = 0 + self.data_time = 0 + self.gpu_time = 0 + self.silog = 0 # Scale invariant logarithmic error [log(m)*100] + self.photometric = 0 + + def set_to_worst(self): + self.irmse = np.inf + self.imae = np.inf + self.mse = np.inf + self.rmse = np.inf + self.mae = np.inf + self.absrel = np.inf + self.squared_rel = np.inf + self.lg10 = np.inf + self.silog = np.inf + self.delta1 = 0 + self.delta2 = 0 + self.delta3 = 0 + self.data_time = 0 + self.gpu_time = 0 + + def update(self, + irmse, + imae, + mse, + rmse, + mae, + absrel, + squared_rel, + lg10, + delta1, + delta2, + delta3, + gpu_time, + data_time, + silog, + photometric=0): + """update""" + self.irmse = irmse + self.imae = imae + self.mse = mse + self.rmse = rmse + self.mae = mae + self.absrel = absrel + self.squared_rel = squared_rel + self.lg10 = lg10 + self.delta1 = delta1 + self.delta2 = delta2 + self.delta3 = delta3 + self.data_time = data_time + self.gpu_time = gpu_time + self.silog = silog + self.photometric = photometric + + def evaluate(self, output, target, photometric=0): + """evaluate""" + valid_mask = target > 0.1 + + # convert from meters to mm + output_mm = 1e3 * output[valid_mask] + target_mm = 1e3 * target[valid_mask] + + abs_diff = (output_mm - target_mm).abs() + + self.mse = float((torch.pow(abs_diff, 2)).mean()) + self.rmse = math.sqrt(self.mse) + self.mae = float(abs_diff.mean()) + self.lg10 = float((log10(output_mm) - log10(target_mm)).abs().mean()) + self.absrel = float((abs_diff / target_mm).mean()) + self.squared_rel = float(((abs_diff / target_mm)**2).mean()) + + maxRatio = torch.max(output_mm / target_mm, target_mm / output_mm) + self.delta1 = float((maxRatio < 1.25).float().mean()) + self.delta2 = float((maxRatio < 1.25**2).float().mean()) + self.delta3 = float((maxRatio < 1.25**3).float().mean()) + self.data_time = 0 + self.gpu_time = 0 + + # silog uses meters + err_log = torch.log(target[valid_mask]) - torch.log(output[valid_mask]) + normalized_squared_log = (err_log**2).mean() + log_mean = err_log.mean() + self.silog = math.sqrt(normalized_squared_log + - log_mean * log_mean) * 100 + + # convert from meters to km + inv_output_km = (1e-3 * output[valid_mask])**(-1) + inv_target_km = (1e-3 * target[valid_mask])**(-1) + abs_inv_diff = (inv_output_km - inv_target_km).abs() + self.irmse = math.sqrt((torch.pow(abs_inv_diff, 2)).mean()) + self.imae = float(abs_inv_diff.mean()) + + self.photometric = float(photometric) + + +class AverageMeter(object): + """AverageMeter""" + + def __init__(self): + self.reset() + + def reset(self): + """reset""" + self.count = 0.0 + self.sum_irmse = 0 + self.sum_imae = 0 + self.sum_mse = 0 + self.sum_rmse = 0 + self.sum_mae = 0 + self.sum_absrel = 0 + self.sum_squared_rel = 0 + self.sum_lg10 = 0 + self.sum_delta1 = 0 + self.sum_delta2 = 0 + self.sum_delta3 = 0 + self.sum_data_time = 0 + self.sum_gpu_time = 0 + self.sum_photometric = 0 + self.sum_silog = 0 + + def update(self, result, gpu_time, data_time, n=1): + """update""" + self.count += n + self.sum_irmse += n * result.irmse + self.sum_imae += n * result.imae + self.sum_mse += n * result.mse + self.sum_rmse += n * result.rmse 
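As a reference for the metric definitions above, here is a minimal, self-contained sketch (not part of the patch) of the arithmetic that Result.evaluate performs on a toy prediction/ground-truth pair; zeros in the target mark missing LiDAR returns and are masked out, and errors are reported in millimetres:

import math

import torch

output = torch.tensor([[1.00, 2.10], [3.20, 0.50]])  # predicted depth in metres
target = torch.tensor([[1.10, 2.00], [0.00, 0.45]])  # ground truth, 0 = no return

valid = target > 0.1                 # same validity mask as Result.evaluate
out_mm = 1e3 * output[valid]         # metres -> millimetres
tgt_mm = 1e3 * target[valid]

abs_diff = (out_mm - tgt_mm).abs()
rmse = math.sqrt(float((abs_diff ** 2).mean()))
mae = float(abs_diff.mean())
max_ratio = torch.max(out_mm / tgt_mm, tgt_mm / out_mm)
delta1 = float((max_ratio < 1.25).float().mean())
print(f'rmse={rmse:.1f}mm  mae={mae:.1f}mm  delta1={delta1:.2f}')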
+ self.sum_mae += n * result.mae + self.sum_absrel += n * result.absrel + self.sum_squared_rel += n * result.squared_rel + self.sum_lg10 += n * result.lg10 + self.sum_delta1 += n * result.delta1 + self.sum_delta2 += n * result.delta2 + self.sum_delta3 += n * result.delta3 + self.sum_data_time += n * data_time + self.sum_gpu_time += n * gpu_time + self.sum_silog += n * result.silog + self.sum_photometric += n * result.photometric + + def average(self): + """average""" + avg = Result() + if self.count > 0: + avg.update( + self.sum_irmse / self.count, self.sum_imae / self.count, + self.sum_mse / self.count, self.sum_rmse / self.count, + self.sum_mae / self.count, self.sum_absrel / self.count, + self.sum_squared_rel / self.count, self.sum_lg10 / self.count, + self.sum_delta1 / self.count, self.sum_delta2 / self.count, + self.sum_delta3 / self.count, self.sum_gpu_time / self.count, + self.sum_data_time / self.count, self.sum_silog / self.count, + self.sum_photometric / self.count) + return avg diff --git a/modelscope/models/cv/self_supervised_depth_completion/model.py b/modelscope/models/cv/self_supervised_depth_completion/model.py new file mode 100644 index 000000000..2a56b3178 --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/model.py @@ -0,0 +1,215 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from torchvision.models import resnet + + +def init_weights(m): + """init_weights""" + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + m.weight.data.normal_(0, 1e-3) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.ConvTranspose2d): + m.weight.data.normal_(0, 1e-3) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + +def conv_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + bn=True, + relu=True): + """conv_bn_relu""" + bias = not bn + layers = [] + layers.append( + nn.Conv2d( + in_channels, out_channels, kernel_size, stride, padding, + bias=bias)) + if bn: + layers.append(nn.BatchNorm2d(out_channels)) + if relu: + layers.append(nn.LeakyReLU(0.2, inplace=True)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.modules(): + init_weights(m) + + return layers + + +def convt_bn_relu(in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + bn=True, + relu=True): + """convt_bn_relu""" + bias = not bn + layers = [] + layers.append( + nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size, + stride, + padding, + output_padding, + bias=bias)) + if bn: + layers.append(nn.BatchNorm2d(out_channels)) + if relu: + layers.append(nn.LeakyReLU(0.2, inplace=True)) + layers = nn.Sequential(*layers) + + # initialize the weights + for m in layers.modules(): + init_weights(m) + + return layers + + +class DepthCompletionNet(nn.Module): + """DepthCompletionNet""" + + def __init__(self, args): + assert ( + args.layers in [18, 34, 50, 101, 152] + ), f'Only layers 18, 34, 50, 101, and 152 are defined, but got {layers}'.format( + layers) + super(DepthCompletionNet, self).__init__() + self.modality = args.input + + if 'd' in self.modality: + channels = 64 // len(self.modality) + self.conv1_d = conv_bn_relu( + 1, channels, kernel_size=3, stride=1, padding=1) + if 'rgb' in self.modality: + channels = 64 * 3 // len(self.modality) + self.conv1_img = conv_bn_relu( + 3, channels, kernel_size=3, stride=1, padding=1) + elif 'g' in self.modality: + channels = 64 // 
len(self.modality) + self.conv1_img = conv_bn_relu( + 1, channels, kernel_size=3, stride=1, padding=1) + + pretrained_model = resnet.__dict__['resnet{}'.format(args.layers)]( + pretrained=args.pretrained) + if not args.pretrained: + pretrained_model.apply(init_weights) + # self.maxpool = pretrained_model._modules['maxpool'] + self.conv2 = pretrained_model._modules['layer1'] + self.conv3 = pretrained_model._modules['layer2'] + self.conv4 = pretrained_model._modules['layer3'] + self.conv5 = pretrained_model._modules['layer4'] + del pretrained_model # clear memory + + # define number of intermediate channels + if args.layers <= 34: + num_channels = 512 + elif args.layers >= 50: + num_channels = 2048 + self.conv6 = conv_bn_relu( + num_channels, 512, kernel_size=3, stride=2, padding=1) + + # decoding layers + kernel_size = 3 + stride = 2 + self.convt5 = convt_bn_relu( + in_channels=512, + out_channels=256, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt4 = convt_bn_relu( + in_channels=768, + out_channels=128, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt3 = convt_bn_relu( + in_channels=(256 + 128), + out_channels=64, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt2 = convt_bn_relu( + in_channels=(128 + 64), + out_channels=64, + kernel_size=kernel_size, + stride=stride, + padding=1, + output_padding=1) + self.convt1 = convt_bn_relu( + in_channels=128, + out_channels=64, + kernel_size=kernel_size, + stride=1, + padding=1) + self.convtf = conv_bn_relu( + in_channels=128, + out_channels=1, + kernel_size=1, + stride=1, + bn=False, + relu=False) + + def forward(self, x): + """forward""" + # first layer + if 'd' in self.modality: + conv1_d = self.conv1_d(x['d']) + if 'rgb' in self.modality: + conv1_img = self.conv1_img(x['rgb']) + elif 'g' in self.modality: + conv1_img = self.conv1_img(x['g']) + + if self.modality == 'rgbd' or self.modality == 'gd': + conv1 = torch.cat((conv1_d, conv1_img), 1) + else: + conv1 = conv1_d if (self.modality == 'd') else conv1_img + + conv2 = self.conv2(conv1) + conv3 = self.conv3(conv2) # batchsize * ? * 176 * 608 + conv4 = self.conv4(conv3) # batchsize * ? * 88 * 304 + conv5 = self.conv5(conv4) # batchsize * ? * 44 * 152 + conv6 = self.conv6(conv5) # batchsize * ? 
* 22 * 76 + + # decoder + convt5 = self.convt5(conv6) + y = torch.cat((convt5, conv5), 1) + + convt4 = self.convt4(y) + y = torch.cat((convt4, conv4), 1) + + convt3 = self.convt3(y) + y = torch.cat((convt3, conv3), 1) + + convt2 = self.convt2(y) + y = torch.cat((convt2, conv2), 1) + + convt1 = self.convt1(y) + y = torch.cat((convt1, conv1), 1) + + y = self.convtf(y) + + if self.training: + return 100 * y + else: + min_distance = 0.9 + return F.relu( + 100 * y - min_distance + ) + min_distance # the minimum range of Velodyne is around 3 feet ~= 0.9m diff --git a/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py b/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py new file mode 100644 index 000000000..4e7046f6b --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/self_supervised_depth_completion.py @@ -0,0 +1,225 @@ +# import argparse +import os +import sys +import time +# import mmcv +from argparse import ArgumentParser +# import torchvision +from os import makedirs + +import cv2 +import numpy as np +import torch +import torch.nn.parallel +import torch.optim +import torch.utils.data +from tqdm import tqdm + +from modelscope.metainfo import Models +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.models.cv.self_supervised_depth_completion import (criteria, + helper) +from modelscope.models.cv.self_supervised_depth_completion.dataloaders.kitti_loader import ( + KittiDepth, input_options, load_calib, oheight, owidth) +from modelscope.models.cv.self_supervised_depth_completion.inverse_warp import ( + Intrinsics, homography_from) +from modelscope.models.cv.self_supervised_depth_completion.metrics import ( + AverageMeter, Result) +from modelscope.models.cv.self_supervised_depth_completion.model import \ + DepthCompletionNet +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE' + +sys.path.append(os.path.dirname(os.path.abspath(__file__))) + +# from modelscope.utils.config import Config + +m_logger = get_logger() + + +class ArgsList(): + """ArgsList Class""" + + def __init__(self) -> None: + self.workers = 4 + self.epochs = 11 + self.start_epoch = 0 + self.criterion = 'l2' + self.batch_size = 1 + self.learning_rate = 1e-5 + self.weight_decay = 0 + self.print_freq = 10 + self.resume = '' + self.data_folder = '../data' + self.input = 'gd' + self.layers = 34 + self.pretrained = True + self.val = 'select' + self.jitter = 0.1 + self.rank_metric = 'rmse' + self.evaluate = '' + self.cpu = False + + +@MODELS.register_module( + Tasks.self_supervised_depth_completion, + module_name=Models.self_supervised_depth_completion) +class SelfSupervisedDepthCompletion(TorchModel): + """SelfSupervisedDepthCompletion Class""" + + def __init__(self, model_dir: str, **kwargs): + """str -- model file root.""" + super().__init__(model_dir, **kwargs) + + args = ArgsList() + # define loss functions + self.depth_criterion = criteria.MaskedMSELoss() + self.photometric_criterion = criteria.PhotometricLoss() + self.smoothness_criterion = criteria.SmoothnessLoss() + + # args.use_pose = ('photo' in args.train_mode) + args.use_pose = True + # args.pretrained = not args.no_pretrained + args.use_rgb = ('rgb' in args.input) or args.use_pose + args.use_d = 'd' in args.input + args.use_g = 'g' in args.input + + args.evaluate = os.path.join(self.model_dir, 'model_best.pth') + + 
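A quick shape check may help when wiring this model in. The sketch below (illustrative only, assuming the patched modelscope package is importable) builds DepthCompletionNet for the grayscale-plus-depth ('gd') modality used by this pipeline and runs a dummy forward pass; height and width must be multiples of 16 because the encoder downsamples by 2 four times, and in eval mode the output is clamped to the ~0.9 m minimum range by the ReLU trick above.

from types import SimpleNamespace

import torch

from modelscope.models.cv.self_supervised_depth_completion.model import \
    DepthCompletionNet

# pretrained=False avoids downloading ImageNet ResNet weights for this check
args = SimpleNamespace(layers=34, input='gd', pretrained=False)
net = DepthCompletionNet(args).eval()

batch = {
    'g': torch.rand(1, 1, 64, 64),  # grayscale image
    'd': torch.rand(1, 1, 64, 64),  # sparse depth
}
with torch.no_grad():
    depth = net(batch)
print(depth.shape)          # torch.Size([1, 1, 64, 64])
print(float(depth.min()))   # >= 0.9 because of the min-distance clamp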
if args.use_pose: + args.w1, args.w2 = 0.1, 0.1 + else: + args.w1, args.w2 = 0, 0 + + self.cuda = torch.cuda.is_available() and not args.cpu + if self.cuda: + import torch.backends.cudnn as cudnn + cudnn.benchmark = True + self.device = torch.device('cuda') + else: + self.device = torch.device('cpu') + print("=> using '{}' for computation.".format(self.device)) + + args_new = args + if os.path.isfile(args.evaluate): + print( + "=> loading checkpoint '{}' ... ".format(args.evaluate), + end='') + self.checkpoint = torch.load( + args.evaluate, map_location=self.device) + args = self.checkpoint['args'] + args.val = args_new.val + print('Completed.') + else: + print("No model found at '{}'".format(args.evaluate)) + return + + print('=> creating model and optimizer ... ', end='') + model = DepthCompletionNet(args).to(self.device) + model_named_params = [ + p for _, p in model.named_parameters() if p.requires_grad + ] + optimizer = torch.optim.Adam( + model_named_params, lr=args.lr, weight_decay=args.weight_decay) + print('completed.') + if self.checkpoint is not None: + model.load_state_dict(self.checkpoint['model']) + optimizer.load_state_dict(self.checkpoint['optimizer']) + print('=> checkpoint state loaded.') + + model = torch.nn.DataParallel(model) + + self.model = model + self.args = args + + def iterate(self, mode, args, loader, model, optimizer, logger, epoch): + """iterate data""" + block_average_meter = AverageMeter() + average_meter = AverageMeter() + meters = [block_average_meter, average_meter] + merged_img = None + # switch to appropriate mode + assert mode in ['train', 'val', 'eval', 'test_prediction', 'test_completion'], \ + 'unsupported mode: {}'.format(mode) + model.eval() + lr = 0 + + for i, batch_data in enumerate(loader): + start = time.time() + batch_data = { + key: val.to(self.device) + for key, val in batch_data.items() if val is not None + } + gt = batch_data[ + 'gt'] if mode != 'test_prediction' and mode != 'test_completion' else None + data_time = time.time() - start + + start = time.time() + pred = model(batch_data) + photometric_loss = 0 + gpu_time = time.time() - start + + # measure accuracy and record loss + with torch.no_grad(): + mini_batch_size = next(iter(batch_data.values())).size(0) + result = Result() + if mode != 'test_prediction' and mode != 'test_completion': + result.evaluate(pred.data, gt.data, photometric_loss) + [ + m.update(result, gpu_time, data_time, mini_batch_size) + for m in meters + ] + logger.conditional_print(mode, i, epoch, lr, len(loader), + block_average_meter, average_meter) + merged_img = logger.conditional_save_img_comparison( + mode, i, batch_data, pred, epoch) + merged_img = cv2.cvtColor(merged_img, cv2.COLOR_RGB2BGR) + logger.conditional_save_pred(mode, i, pred, epoch) + + avg = logger.conditional_save_info(mode, average_meter, epoch) + is_best = logger.rank_conditional_save_best(mode, avg, epoch) + logger.save_img_comparison_as_best(mode, epoch) + logger.conditional_summarize(mode, avg, is_best) + + return avg, is_best, merged_img + + def forward(self, source_dir): + """main function""" + + args = self.args + args.data_folder = source_dir + args.result = os.path.join(args.data_folder, 'results') + if args.use_pose: + # hard-coded KITTI camera intrinsics + K = load_calib(args) + fu, fv = float(K[0, 0]), float(K[1, 1]) + cu, cv = float(K[0, 2]), float(K[1, 2]) + kitti_intrinsics = Intrinsics(owidth, oheight, fu, fv, cu, cv) + if self.cuda: + kitti_intrinsics = kitti_intrinsics.cuda() + + # Data loading code + print('=> creating data 
loaders ... ') + val_dataset = KittiDepth('val', self.args) + val_loader = torch.utils.data.DataLoader( + val_dataset, + batch_size=1, + shuffle=False, + num_workers=2, + pin_memory=True) # set batch size to be 1 for validation + print('\t==> val_loader size:{}'.format(len(val_loader))) + + # create backups and results folder + logger = helper.logger(self.args) + if self.checkpoint is not None: + logger.best_result = self.checkpoint['best_result'] + + print('=> starting model evaluation ...') + result, is_best, merged_img = self.iterate('val', self.args, + val_loader, self.model, + None, logger, + self.checkpoint['epoch']) + return merged_img diff --git a/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py b/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py new file mode 100644 index 000000000..38dfa43fa --- /dev/null +++ b/modelscope/models/cv/self_supervised_depth_completion/vis_utils.py @@ -0,0 +1,119 @@ +import os + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +from PIL import Image + +if not ('DISPLAY' in os.environ): + import matplotlib as mpl + mpl.use('Agg') + +cmap = plt.cm.jet + + +def depth_colorize(depth): + depth = (depth - np.min(depth)) / (np.max(depth) - np.min(depth)) + depth = 255 * cmap(depth)[:, :, :3] # H, W, C + return depth.astype('uint8') + + +def merge_into_row(ele, pred): + + def preprocess_depth(x): + y = np.squeeze(x.data.cpu().numpy()) + return depth_colorize(y) + + # if is gray, transforms to rgb + img_list = [] + if 'rgb' in ele: + rgb = np.squeeze(ele['rgb'][0, ...].data.cpu().numpy()) + rgb = np.transpose(rgb, (1, 2, 0)) + img_list.append(rgb) + elif 'g' in ele: + g = np.squeeze(ele['g'][0, ...].data.cpu().numpy()) + g = np.array(Image.fromarray(g).convert('RGB')) + img_list.append(g) + if 'd' in ele: + img_list.append(preprocess_depth(ele['d'][0, ...])) + img_list.append(preprocess_depth(pred[0, ...])) + if 'gt' in ele: + img_list.append(preprocess_depth(ele['gt'][0, ...])) + + img_merge = np.hstack(img_list) + return img_merge.astype('uint8') + + +def add_row(img_merge, row): + return np.vstack([img_merge, row]) + + +def save_image(img_merge, filename): + image_to_write = cv2.cvtColor(img_merge, cv2.COLOR_RGB2BGR) + cv2.imwrite(filename, image_to_write) + + +def save_depth_as_uint16png(img, filename): + img = (img * 256).astype('uint16') + cv2.imwrite(filename, img) + + +if ('DISPLAY' in os.environ): + f, axarr = plt.subplots(4, 1) + plt.tight_layout() + plt.ion() + + +def display_warping(rgb_tgt, pred_tgt, warped): + + def preprocess(rgb_tgt, pred_tgt, warped): + rgb_tgt = 255 * np.transpose( + np.squeeze(rgb_tgt.data.cpu().numpy()), (1, 2, 0)) # H, W, C + # depth = np.squeeze(depth.cpu().numpy()) + # depth = depth_colorize(depth) + + # convert to log-scale + pred_tgt = np.squeeze(pred_tgt.data.cpu().numpy()) + # pred_tgt[pred_tgt<=0] = 0.9 # remove negative predictions + # pred_tgt = np.log10(pred_tgt) + + pred_tgt = depth_colorize(pred_tgt) + + warped = 255 * np.transpose( + np.squeeze(warped.data.cpu().numpy()), (1, 2, 0)) # H, W, C + recon_err = np.absolute( + warped.astype('float') - rgb_tgt.astype('float')) * ( + warped > 0) + recon_err = recon_err[:, :, 0] + recon_err[:, :, 1] + recon_err[:, :, + 2] + recon_err = depth_colorize(recon_err) + return rgb_tgt.astype('uint8'), warped.astype( + 'uint8'), recon_err, pred_tgt + + rgb_tgt, warped, recon_err, pred_tgt = preprocess(rgb_tgt, pred_tgt, + warped) + + # 1st column + # column = 0 + axarr[0].imshow(rgb_tgt) + axarr[0].axis('off') + 
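For context on how the comparison images are produced, here is a tiny standalone sketch (not part of the patch) of the colorization step performed by depth_colorize above, together with the 16-bit encoding used by save_depth_as_uint16png:

import matplotlib.pyplot as plt
import numpy as np

cmap = plt.cm.jet

depth = np.random.rand(48, 64) * 80.0   # toy depth map in metres
norm = (depth - depth.min()) / (depth.max() - depth.min())
colored = (255 * cmap(norm)[:, :, :3]).astype('uint8')   # RGB, alpha dropped
print(colored.shape, colored.dtype)      # (48, 64, 3) uint8

# save_depth_as_uint16png stores depth * 256 as a 16-bit image (KITTI-style encoding)
as_png = (depth * 256).astype('uint16')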
axarr[0].axis('equal') + # axarr[0, column].set_title('rgb_tgt') + + axarr[1].imshow(warped) + axarr[1].axis('off') + axarr[1].axis('equal') + # axarr[1, column].set_title('warped') + + axarr[2].imshow(recon_err, 'hot') + axarr[2].axis('off') + axarr[2].axis('equal') + # axarr[2, column].set_title('recon_err error') + + axarr[3].imshow(pred_tgt, 'hot') + axarr[3].axis('off') + axarr[3].axis('equal') + # axarr[3, column].set_title('pred_tgt') + + # plt.show() + plt.pause(0.001) diff --git a/modelscope/models/cv/shop_segmentation/head_fpn.py b/modelscope/models/cv/shop_segmentation/head_fpn.py index 0d4027cb7..a1de71a97 100644 --- a/modelscope/models/cv/shop_segmentation/head_fpn.py +++ b/modelscope/models/cv/shop_segmentation/head_fpn.py @@ -9,8 +9,7 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from .common import Upsample, resize diff --git a/modelscope/models/cv/shop_segmentation/models.py b/modelscope/models/cv/shop_segmentation/models.py index a206e9f1c..e6c389d66 100644 --- a/modelscope/models/cv/shop_segmentation/models.py +++ b/modelscope/models/cv/shop_segmentation/models.py @@ -11,8 +11,7 @@ import torch import torch.nn.functional as F import torch.utils.checkpoint as checkpoint -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from torch import nn diff --git a/modelscope/models/cv/shop_segmentation/neck_fpn.py b/modelscope/models/cv/shop_segmentation/neck_fpn.py index d344de713..1b63bcd16 100644 --- a/modelscope/models/cv/shop_segmentation/neck_fpn.py +++ b/modelscope/models/cv/shop_segmentation/neck_fpn.py @@ -8,8 +8,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from timm.models.layers.drop import drop_path -from timm.models.layers.weight_init import trunc_normal_ +from timm.models.layers import drop_path, trunc_normal_ from .common import resize diff --git a/modelscope/models/cv/text_driven_segmentation/clip.py b/modelscope/models/cv/text_driven_segmentation/clip.py index 1cec5f397..9dd40d0eb 100644 --- a/modelscope/models/cv/text_driven_segmentation/clip.py +++ b/modelscope/models/cv/text_driven_segmentation/clip.py @@ -8,9 +8,10 @@ import warnings from typing import Any, List, Union +import packaging +import packaging.version import torch from PIL import Image -from pkg_resources import packaging from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) from tqdm import tqdm diff --git a/modelscope/models/cv/video_depth_estimation/utils/depth.py b/modelscope/models/cv/video_depth_estimation/utils/depth.py index 5fbf6aa6d..2fc16a01e 100644 --- a/modelscope/models/cv/video_depth_estimation/utils/depth.py +++ b/modelscope/models/cv/video_depth_estimation/utils/depth.py @@ -3,7 +3,9 @@ import numpy as np import torch import torchvision.transforms as transforms -from matplotlib.cm import get_cmap +# from matplotlib.cm import get_cmap +# compatible with matplotlib 3.9.0 +from matplotlib.pyplot import get_cmap from modelscope.models.cv.video_depth_estimation.utils.image import ( flip_lr, gradient_x, gradient_y, interpolate_image, load_image) diff --git a/modelscope/models/cv/video_frame_interpolation/__init__.py b/modelscope/models/cv/video_frame_interpolation/__init__.py index 657a375ad..11492faf0 100644 --- 
a/modelscope/models/cv/video_frame_interpolation/__init__.py +++ b/modelscope/models/cv/video_frame_interpolation/__init__.py @@ -5,9 +5,10 @@ if TYPE_CHECKING: from .VFINet_arch import VFINet + from .rife import RIFEModel else: - _import_structure = {'VFINet_arch': ['VFINet']} + _import_structure = {'VFINet_arch': ['VFINet'], 'rife': ['RIFEModel']} import sys diff --git a/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py b/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py new file mode 100644 index 000000000..e904aad28 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/IFNet_HDv3.py @@ -0,0 +1,158 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .warplayer import warp + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +def conv(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=True), nn.PReLU(out_planes)) + + +def conv_bn(in_planes, + out_planes, + kernel_size=3, + stride=1, + padding=1, + dilation=1): + return nn.Sequential( + nn.Conv2d( + in_planes, + out_planes, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + bias=False), nn.BatchNorm2d(out_planes), nn.PReLU(out_planes)) + + +class IFBlock(nn.Module): + + def __init__(self, in_planes, c=64): + super(IFBlock, self).__init__() + self.conv0 = nn.Sequential( + conv(in_planes, c // 2, 3, 2, 1), + conv(c // 2, c, 3, 2, 1), + ) + self.convblock0 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock1 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock2 = nn.Sequential(conv(c, c), conv(c, c)) + self.convblock3 = nn.Sequential(conv(c, c), conv(c, c)) + self.conv1 = nn.Sequential( + nn.ConvTranspose2d(c, c // 2, 4, 2, 1), + nn.PReLU(c // 2), + nn.ConvTranspose2d(c // 2, 4, 4, 2, 1), + ) + self.conv2 = nn.Sequential( + nn.ConvTranspose2d(c, c // 2, 4, 2, 1), + nn.PReLU(c // 2), + nn.ConvTranspose2d(c // 2, 1, 4, 2, 1), + ) + + def forward(self, x, flow, scale=1): + x = F.interpolate( + x, + scale_factor=1. / scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) + flow = F.interpolate( + flow, + scale_factor=1. / scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) * 1. 
/ scale + feat = self.conv0(torch.cat((x, flow), 1)) + feat = self.convblock0(feat) + feat + feat = self.convblock1(feat) + feat + feat = self.convblock2(feat) + feat + feat = self.convblock3(feat) + feat + flow = self.conv1(feat) + mask = self.conv2(feat) + flow = F.interpolate( + flow, + scale_factor=scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) * scale + mask = F.interpolate( + mask, + scale_factor=scale, + mode='bilinear', + align_corners=False, + recompute_scale_factor=False) + return flow, mask + + +class IFNet(nn.Module): + + def __init__(self): + super(IFNet, self).__init__() + self.block0 = IFBlock(7 + 4, c=90) + self.block1 = IFBlock(7 + 4, c=90) + self.block2 = IFBlock(7 + 4, c=90) + self.block_tea = IFBlock(10 + 4, c=90) + # self.contextnet = Contextnet() + # self.unet = Unet() + + def forward(self, x, scale_list=[4, 2, 1], training=False): + if training is False: + channel = x.shape[1] // 2 + img0 = x[:, :channel] + img1 = x[:, channel:] + flow_list = [] + merged = [] + mask_list = [] + warped_img0 = img0 + warped_img1 = img1 + flow = (x[:, :4]).detach() * 0 + mask = (x[:, :1]).detach() * 0 + # loss_cons = 0 + block = [self.block0, self.block1, self.block2] + for i in range(3): + f0, m0 = block[i]( + torch.cat((warped_img0[:, :3], warped_img1[:, :3], mask), 1), + flow, + scale=scale_list[i]) + f1, m1 = block[i]( + torch.cat((warped_img1[:, :3], warped_img0[:, :3], -mask), 1), + torch.cat((flow[:, 2:4], flow[:, :2]), 1), + scale=scale_list[i]) + flow = flow + (f0 + torch.cat((f1[:, 2:4], f1[:, :2]), 1)) / 2 + mask = mask + (m0 + (-m1)) / 2 + mask_list.append(mask) + flow_list.append(flow) + warped_img0 = warp(img0, flow[:, :2]) + warped_img1 = warp(img1, flow[:, 2:4]) + merged.append((warped_img0, warped_img1)) + ''' + c0 = self.contextnet(img0, flow[:, :2]) + c1 = self.contextnet(img1, flow[:, 2:4]) + tmp = self.unet(img0, img1, warped_img0, warped_img1, mask, flow, c0, c1) + res = tmp[:, 1:4] * 2 - 1 + ''' + for i in range(3): + mask_list[i] = torch.sigmoid(mask_list[i]) + merged[i] = merged[i][0] * mask_list[i] + merged[i][1] * ( + 1 - mask_list[i]) + # merged[i] = torch.clamp(merged[i] + res, 0, 1) + return flow_list, mask_list[2], merged diff --git a/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py b/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py new file mode 100644 index 000000000..090b7cd76 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/RIFE_HDv3.py @@ -0,0 +1,124 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import itertools + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.optim import AdamW + +from modelscope.metainfo import Models +from modelscope.models.base import Tensor +from modelscope.models.base.base_torch_model import TorchModel +from modelscope.models.builder import MODELS +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger +from .IFNet_HDv3 import * +from .loss import * +from .warplayer import warp + + +@MODELS.register_module( + Tasks.video_frame_interpolation, module_name=Models.rife) +class RIFEModel(TorchModel): + + def __init__(self, model_dir, *args, **kwargs): + 
super().__init__(model_dir, *args, **kwargs) + self.device = torch.device( + 'cuda' if torch.cuda.is_available() else 'cpu') + self.flownet = IFNet() + self.flownet.to(self.device) + self.optimG = AdamW( + self.flownet.parameters(), lr=1e-6, weight_decay=1e-4) + self.epe = EPE() + # self.vgg = VGGPerceptualLoss().to(device) + self.sobel = SOBEL() + self.load_model(model_dir, -1) + self.eval() + + def train(self): + self.flownet.train() + + def eval(self): + self.flownet.eval() + + def load_model(self, path, rank=0): + + def convert(param): + if rank == -1: + return { + k.replace('module.', ''): v + for k, v in param.items() if 'module.' in k + } + else: + return param + + if rank <= 0: + if torch.cuda.is_available(): + self.flownet.load_state_dict( + convert(torch.load('{}/flownet.pkl'.format(path)))) + else: + self.flownet.load_state_dict( + convert( + torch.load( + '{}/flownet.pkl'.format(path), + map_location='cpu'))) + + def save_model(self, path, rank=0): + if rank == 0: + torch.save(self.flownet.state_dict(), + '{}/flownet.pkl'.format(path)) + + def inference(self, img0, img1, scale=1.0): + imgs = torch.cat((img0, img1), 1) + scale_list = [4 / scale, 2 / scale, 1 / scale] + _, _, merged = self.flownet(imgs, scale_list) + return merged[2].detach() + + def forward(self, inputs): + img0 = inputs['img0'] + img1 = inputs['img1'] + scale = inputs['scale'] + return {'output': self.inference(img0, img1, scale)} + + def update(self, + imgs, + gt, + learning_rate=0, + mul=1, + training=True, + flow_gt=None): + for param_group in self.optimG.param_groups: + param_group['lr'] = learning_rate + # img0 = imgs[:, :3] + # img1 = imgs[:, 3:] + if training: + self.train() + else: + self.eval() + scale = [4, 2, 1] + flow, mask, merged = self.flownet( + torch.cat((imgs, gt), 1), scale=scale, training=training) + loss_l1 = (merged[2] - gt).abs().mean() + loss_smooth = self.sobel(flow[2], flow[2] * 0).mean() + # loss_vgg = self.vgg(merged[2], gt) + if training: + self.optimG.zero_grad() + loss_G = loss_cons + loss_smooth * 0.1 + loss_G.backward() + self.optimG.step() + # else: + # flow_teacher = flow[2] + return merged[2], { + 'mask': mask, + 'flow': flow[2][:, :2], + 'loss_l1': loss_l1, + 'loss_cons': loss_cons, + 'loss_smooth': loss_smooth, + } diff --git a/modelscope/models/cv/video_frame_interpolation/rife/__init__.py b/modelscope/models/cv/video_frame_interpolation/rife/__init__.py new file mode 100644 index 000000000..af475199c --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/__init__.py @@ -0,0 +1,5 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +from .RIFE_HDv3 import RIFEModel diff --git a/modelscope/models/cv/video_frame_interpolation/rife/loss.py b/modelscope/models/cv/video_frame_interpolation/rife/loss.py new file mode 100644 index 000000000..97f7644ca --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/loss.py @@ -0,0 +1,144 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torchvision.models as models + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + + +class EPE(nn.Module): + + def __init__(self): + super(EPE, self).__init__() + 
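Since RIFEModel.inference above is the interpolation entry point, a short usage sketch may help (illustrative only; the model directory below is a hypothetical placeholder that must contain the flownet.pkl weights load_model expects, and the frames are [0, 1]-normalized NCHW tensors whose height and width are multiples of 32 so every coarse-to-fine scale divides evenly):

import torch

from modelscope.models.cv.video_frame_interpolation.rife import RIFEModel

model = RIFEModel(model_dir='path/to/rife_weights')  # hypothetical local path holding flownet.pkl
img0 = torch.rand(1, 3, 256, 448, device=model.device)
img1 = torch.rand(1, 3, 256, 448, device=model.device)

with torch.no_grad():
    mid = model.inference(img0, img1, scale=1.0)  # frame halfway between img0 and img1
print(mid.shape)  # torch.Size([1, 3, 256, 448])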
+ def forward(self, flow, gt, loss_mask): + loss_map = (flow - gt.detach())**2 + loss_map = (loss_map.sum(1, True) + 1e-6)**0.5 + return (loss_map * loss_mask) + + +class Ternary(nn.Module): + + def __init__(self): + super(Ternary, self).__init__() + patch_size = 7 + out_channels = patch_size * patch_size + self.w = np.eye(out_channels).reshape( + (patch_size, patch_size, 1, out_channels)) + self.w = np.transpose(self.w, (3, 2, 0, 1)) + self.w = torch.tensor(self.w).float().to(device) + + def transform(self, img): + patches = F.conv2d(img, self.w, padding=3, bias=None) + transf = patches - img + transf_norm = transf / torch.sqrt(0.81 + transf**2) + return transf_norm + + def rgb2gray(self, rgb): + r, g, b = rgb[:, 0:1, :, :], rgb[:, 1:2, :, :], rgb[:, 2:3, :, :] + gray = 0.2989 * r + 0.5870 * g + 0.1140 * b + return gray + + def hamming(self, t1, t2): + dist = (t1 - t2)**2 + dist_norm = torch.mean(dist / (0.1 + dist), 1, True) + return dist_norm + + def valid_mask(self, t, padding): + n, _, h, w = t.size() + inner = torch.ones(n, 1, h - 2 * padding, w - 2 * padding).type_as(t) + mask = F.pad(inner, [padding] * 4) + return mask + + def forward(self, img0, img1): + img0 = self.transform(self.rgb2gray(img0)) + img1 = self.transform(self.rgb2gray(img1)) + return self.hamming(img0, img1) * self.valid_mask(img0, 1) + + +class SOBEL(nn.Module): + + def __init__(self): + super(SOBEL, self).__init__() + self.kernelX = torch.tensor([ + [1, 0, -1], + [2, 0, -2], + [1, 0, -1], + ]).float() + self.kernelY = self.kernelX.clone().T + self.kernelX = self.kernelX.unsqueeze(0).unsqueeze(0).to(device) + self.kernelY = self.kernelY.unsqueeze(0).unsqueeze(0).to(device) + + def forward(self, pred, gt): + N, C, H, W = pred.shape[0], pred.shape[1], pred.shape[2], pred.shape[3] + img_stack = torch.cat( + [pred.reshape(N * C, 1, H, W), + gt.reshape(N * C, 1, H, W)], 0) + sobel_stack_x = F.conv2d(img_stack, self.kernelX, padding=1) + sobel_stack_y = F.conv2d(img_stack, self.kernelY, padding=1) + pred_X, gt_X = sobel_stack_x[:N * C], sobel_stack_x[N * C:] + pred_Y, gt_Y = sobel_stack_y[:N * C], sobel_stack_y[N * C:] + + L1X, L1Y = torch.abs(pred_X - gt_X), torch.abs(pred_Y - gt_Y) + loss = (L1X + L1Y) + return loss + + +class MeanShift(nn.Conv2d): + + def __init__(self, data_mean, data_std, data_range=1, norm=True): + c = len(data_mean) + super(MeanShift, self).__init__(c, c, kernel_size=1) + std = torch.Tensor(data_std) + self.weight.data = torch.eye(c).view(c, c, 1, 1) + if norm: + self.weight.data.div_(std.view(c, 1, 1, 1)) + self.bias.data = -1 * data_range * torch.Tensor(data_mean) + self.bias.data.div_(std) + else: + self.weight.data.mul_(std.view(c, 1, 1, 1)) + self.bias.data = data_range * torch.Tensor(data_mean) + self.requires_grad = False + + +class VGGPerceptualLoss(torch.nn.Module): + + def __init__(self, rank=0): + super(VGGPerceptualLoss, self).__init__() + # blocks = [] + pretrained = True + self.vgg_pretrained_features = models.vgg19( + pretrained=pretrained).features + self.normalize = MeanShift([0.485, 0.456, 0.406], + [0.229, 0.224, 0.225], + norm=True).cuda() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, X, Y, indices=None): + X = self.normalize(X) + Y = self.normalize(Y) + indices = [2, 7, 12, 21, 30] + weights = [1.0 / 2.6, 1.0 / 4.8, 1.0 / 3.7, 1.0 / 5.6, 10 / 1.5] + k = 0 + loss = 0 + for i in range(indices[-1]): + X = self.vgg_pretrained_features[i](X) + Y = self.vgg_pretrained_features[i](Y) + if (i + 1) in indices: + loss += weights[k] * (X - 
Y.detach()).abs().mean() * 0.1 + k += 1 + return loss + + +if __name__ == '__main__': + img0 = torch.zeros(3, 3, 256, 256).float().to(device) + img1 = torch.tensor(np.random.normal(0, 1, + (3, 3, 256, 256))).float().to(device) + ternary_loss = Ternary() + print(ternary_loss(img0, img1).shape) diff --git a/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py b/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py new file mode 100644 index 000000000..e4440e6f3 --- /dev/null +++ b/modelscope/models/cv/video_frame_interpolation/rife/warplayer.py @@ -0,0 +1,40 @@ +# The implementation here is modified based on ECCV2022-RIFE, +# originally MIT License, Copyright (c) Megvii Inc., +# and publicly available at https://github.com/megvii-research/ECCV2022-RIFE + +import torch +import torch.nn as nn + +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +backwarp_tenGrid = {} + + +def warp(tenInput, tenFlow): + k = (str(tenFlow.device), str(tenFlow.size())) + if k not in backwarp_tenGrid: + tenHorizontal = torch.linspace( + -1.0, 1.0, tenFlow.shape[3], device=device).view( + 1, 1, 1, tenFlow.shape[3]).expand(tenFlow.shape[0], -1, + tenFlow.shape[2], -1) + tenVertical = torch.linspace( + -1.0, 1.0, tenFlow.shape[2], + device=device).view(1, 1, tenFlow.shape[2], + 1).expand(tenFlow.shape[0], -1, -1, + tenFlow.shape[3]) + backwarp_tenGrid[k] = torch.cat([tenHorizontal, tenVertical], + 1).to(device) + + tenFlow = torch.cat( + [ + tenFlow[:, 0:1, :, :] / ((tenInput.shape[3] - 1.0) / 2.0), # no qa + tenFlow[:, 1:2, :, :] / ((tenInput.shape[2] - 1.0) / 2.0) + ], + 1) # no qa + + g = (backwarp_tenGrid[k] + tenFlow).permute(0, 2, 3, 1) + return torch.nn.functional.grid_sample( + input=tenInput, + grid=g, + mode='bilinear', + padding_mode='border', + align_corners=True) diff --git a/modelscope/models/cv/video_stabilization/DUT/config.py b/modelscope/models/cv/video_stabilization/DUT/config.py index 85c33bc3c..dde1d11fe 100644 --- a/modelscope/models/cv/video_stabilization/DUT/config.py +++ b/modelscope/models/cv/video_stabilization/DUT/config.py @@ -64,7 +64,7 @@ # scale strength weight __C.TRAIN.scale_com_strength = 100.0 -# non maximum supression threshold +# non maximum suppression threshold __C.TRAIN.NMS_THRESH = 0.0 # nms kernel size diff --git a/modelscope/models/cv/vidt/backbone.py b/modelscope/models/cv/vidt/backbone.py index 198ab498d..bcfcff9fb 100644 --- a/modelscope/models/cv/vidt/backbone.py +++ b/modelscope/models/cv/vidt/backbone.py @@ -440,7 +440,7 @@ def forward(self, x, mask_matrix, pos, cross_attn, cross_attn_mask): det = det + det_pos shifted_x = (shifted_x, cross_patch) else: - # it cross_attn is deativated, only [PATCH] and [DET] self-attention are performed + # it cross_attn is deactivated, only [PATCH] and [DET] self-attention are performed det = det + det_pos shifted_x = shifted_x @@ -961,7 +961,7 @@ def finetune_det(self, block.det_token_num = det_token_num block.det_pos_linear = nn.Linear(pos_dim, block.dim) - # neck-free model do not require downsamling at the last stage. + # neck-free model do not require downsampling at the last stage. 
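# A minimal usage sketch for warp() from rife/warplayer.py above: backward-warp a frame
# (or feature map) by a dense optical-flow field via grid_sample. Shapes and values are
# illustrative; with zero flow the warped output matches the input.
import torch
from modelscope.models.cv.video_frame_interpolation.rife.warplayer import warp

dev = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # same rule as the module-level device
frame = torch.rand(1, 3, 64, 64, device=dev)      # source frame / feature map
flow = torch.zeros(1, 2, 64, 64, device=dev)      # per-pixel (dx, dy) displacements in pixels
warped = warp(frame, flow)                        # same shape as frame
assert torch.allclose(frame, warped, atol=1e-5)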
if method == 'vidt_wo_neck': self.layers[-1].downsample = None diff --git a/modelscope/models/cv/vidt/fpn_fusion.py b/modelscope/models/cv/vidt/fpn_fusion.py index b48ba0feb..f0531c828 100644 --- a/modelscope/models/cv/vidt/fpn_fusion.py +++ b/modelscope/models/cv/vidt/fpn_fusion.py @@ -30,7 +30,7 @@ def forward(self, x_blocks): x_blocks = x_blocks - # preperation: channel reduction and normalization + # preparation: channel reduction and normalization for idx in range(self.n_block - 1, -1, -1): x_blocks[idx] = getattr(self.multi_scaler, f'layer_{idx}_rn')( x_blocks[idx]) @@ -111,8 +111,8 @@ def __init__(self, features (int): channel dim of the input feature activation: activation function to use bn: whether to use bn - expand: whether to exapnd feature or not - align_corners: wheter to use align_corners for interpolation + expand: whether to expand feature or not + align_corners: whether to use align_corners for interpolation """ super(FeatureFusionBlock, self).__init__() diff --git a/modelscope/models/multi_modal/clip/bert_tokenizer.py b/modelscope/models/multi_modal/clip/bert_tokenizer.py index 1ee715c91..36479d565 100644 --- a/modelscope/models/multi_modal/clip/bert_tokenizer.py +++ b/modelscope/models/multi_modal/clip/bert_tokenizer.py @@ -157,7 +157,7 @@ def whitespace_tokenize(text): class FullTokenizer(object): - """Runs end-to-end tokenziation.""" + """Runs end-to-end tokenization.""" def __init__(self, vocab_file, do_lower_case=True): self.vocab = load_vocab(vocab_file) @@ -185,7 +185,7 @@ def convert_tokens_to_string(tokens, clean_up_tokenization_spaces=True): def clean_up_tokenization(out_string): """ Clean up a list of simple English tokenization artifacts - like spaces before punctuations and abreviated forms. + like spaces before punctuations and abbreviated forms. """ out_string = ( out_string.replace(' .', '.').replace(' ?', '?').replace( @@ -321,7 +321,7 @@ def _clean_text(self, text): class WordpieceTokenizer(object): - """Runs WordPiece tokenziation.""" + """Runs WordPiece tokenization.""" def __init__(self, vocab, unk_token='[UNK]', max_input_chars_per_word=200): self.vocab = vocab @@ -384,7 +384,7 @@ def tokenize(self, text): def _is_whitespace(char): """Checks whether `chars` is a whitespace character.""" - # \t, \n, and \r are technically contorl characters but we treat them + # \t, \n, and \r are technically control characters but we treat them # as whitespace since they are generally considered as such. if char == ' ' or char == '\t' or char == '\n' or char == '\r': return True diff --git a/modelscope/models/multi_modal/clip/configuration_bert.py b/modelscope/models/multi_modal/clip/configuration_bert.py index b75f5db89..b1a3966b2 100644 --- a/modelscope/models/multi_modal/clip/configuration_bert.py +++ b/modelscope/models/multi_modal/clip/configuration_bert.py @@ -37,7 +37,7 @@ class BertConfig(object): layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -46,7 +46,7 @@ class BertConfig(object): (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. 
- initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. layer_norm_eps: The epsilon used by LayerNorm. """ diff --git a/modelscope/models/multi_modal/clip/modeling_bert.py b/modelscope/models/multi_modal/clip/modeling_bert.py index 11c5c8338..7491d40ed 100644 --- a/modelscope/models/multi_modal/clip/modeling_bert.py +++ b/modelscope/models/multi_modal/clip/modeling_bert.py @@ -485,7 +485,7 @@ def forward(self, head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze( -1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters( - )).dtype) # switch to fload if need + fp16 compatibility + )).dtype) # switch to float if need + fp16 compatibility else: head_mask = [None] * self.config.num_hidden_layers diff --git a/modelscope/models/multi_modal/clip_interrogator/model.py b/modelscope/models/multi_modal/clip_interrogator/model.py index a7e27cbd0..c04d7a9b2 100644 --- a/modelscope/models/multi_modal/clip_interrogator/model.py +++ b/modelscope/models/multi_modal/clip_interrogator/model.py @@ -1,4 +1,4 @@ -# This implementation is adopted from CLIP-Interrogator, made pubicly available under the MIT License at +# This implementation is adopted from CLIP-Interrogator, made publicly available under the MIT License at # https://github.com/pharmapsychotic/clip-interrogator/blob/main/clip_interrogator/clip_interrogator.py import hashlib diff --git a/modelscope/models/multi_modal/diffusion/structbert.py b/modelscope/models/multi_modal/diffusion/structbert.py index 0ca57fc4a..764cd0906 100644 --- a/modelscope/models/multi_modal/diffusion/structbert.py +++ b/modelscope/models/multi_modal/diffusion/structbert.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team and Alibaba inc. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -79,7 +79,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -88,7 +88,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ self.vocab_size = vocab_size diff --git a/modelscope/models/multi_modal/diffusion/tokenizer.py b/modelscope/models/multi_modal/diffusion/tokenizer.py index 918498cd8..ef57b63c7 100644 --- a/modelscope/models/multi_modal/diffusion/tokenizer.py +++ b/modelscope/models/multi_modal/diffusion/tokenizer.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team and Alibaba inc. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team and Alibaba inc. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py index aaa588d30..091aeca57 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/control_sd_lora.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. # The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os from dataclasses import dataclass diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py index 79ac2c33b..688378fc1 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/efficient_stable_diffusion.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. # The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os import os.path as osp from functools import partial diff --git a/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py b/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py index 306ca2b0c..8abd9735d 100644 --- a/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py +++ b/modelscope/models/multi_modal/efficient_diffusion_tuning/sd_lora.py @@ -1,6 +1,6 @@ # Copyright 2023-2024 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
# The implementation is adopted from HighCWu, -# made pubicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA +# made publicly available under the Apache License 2.0 License at https://github.com/HighCWu/ControlLoRA import os from dataclasses import dataclass from typing import List, Tuple, Union diff --git a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py index c7ac3f947..d80c6f802 100644 --- a/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/mmr/dataloaders/rawvideo_util.py @@ -1,5 +1,5 @@ # The implementation is adopted from Huaishao Luo, -# made pubicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip import cv2 import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py index 743c049ad..6a54f0a5d 100644 --- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py +++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import os import random diff --git a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py index c2d96275d..48733de49 100644 --- a/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py +++ b/modelscope/models/multi_modal/mmr/models/dynamic_inverted_softmax.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import numpy as np diff --git a/modelscope/models/multi_modal/mmr/models/module_clip.py b/modelscope/models/multi_modal/mmr/models/module_clip.py index 535017203..479ebfb31 100644 --- a/modelscope/models/multi_modal/mmr/models/module_clip.py +++ b/modelscope/models/multi_modal/mmr/models/module_clip.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/mmr/models/module_cross.py b/modelscope/models/multi_modal/mmr/models/module_cross.py index b958d5bca..f4327f8ca 100644 --- a/modelscope/models/multi_modal/mmr/models/module_cross.py +++ b/modelscope/models/multi_modal/mmr/models/module_cross.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip from __future__ import absolute_import, division, print_function import logging diff --git 
a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py index 97ee7156a..5dc5ff6d9 100644 --- a/modelscope/models/multi_modal/mmr/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/mmr/models/tokenization_clip.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import gzip import html diff --git a/modelscope/models/multi_modal/mmr/models/until_module.py b/modelscope/models/multi_modal/mmr/models/until_module.py index 24e886b0f..fcc94dfe5 100644 --- a/modelscope/models/multi_modal/mmr/models/until_module.py +++ b/modelscope/models/multi_modal/mmr/models/until_module.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/mplug/predictor.py b/modelscope/models/multi_modal/mplug/predictor.py index 6375d1d7e..b6165e655 100755 --- a/modelscope/models/multi_modal/mplug/predictor.py +++ b/modelscope/models/multi_modal/mplug/predictor.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py index 76ab11708..e6e7d9ac9 100644 --- a/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py +++ b/modelscope/models/multi_modal/ofa_for_text_to_image_synthesis_model.py @@ -6,10 +6,10 @@ import json import numpy as np +import packaging import torch import torch.cuda from PIL import Image -from pkg_resources import packaging from taming.models.vqgan import GumbelVQ, VQModel from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize, ToTensor) diff --git a/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py b/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py index c7ac3f947..d80c6f802 100644 --- a/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py +++ b/modelscope/models/multi_modal/prost/dataloaders/rawvideo_util.py @@ -1,5 +1,5 @@ # The implementation is adopted from Huaishao Luo, -# made pubicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under the MIT License at https://github.com/ArrowLuo/CLIP4Clip import cv2 import numpy as np diff --git a/modelscope/models/multi_modal/prost/models/module_clip.py b/modelscope/models/multi_modal/prost/models/module_clip.py index c5aaa1e52..b340822ce 100644 --- a/modelscope/models/multi_modal/prost/models/module_clip.py +++ b/modelscope/models/multi_modal/prost/models/module_clip.py @@ -1,5 +1,5 @@ # The implementation is adopated from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at 
https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import hashlib import os diff --git a/modelscope/models/multi_modal/prost/models/module_cross.py b/modelscope/models/multi_modal/prost/models/module_cross.py index fae8e904b..ccfd50e6a 100644 --- a/modelscope/models/multi_modal/prost/models/module_cross.py +++ b/modelscope/models/multi_modal/prost/models/module_cross.py @@ -51,7 +51,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -60,7 +60,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `CrossModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): diff --git a/modelscope/models/multi_modal/prost/models/prost_model.py b/modelscope/models/multi_modal/prost/models/prost_model.py index 022903cb7..f3b5947bb 100644 --- a/modelscope/models/multi_modal/prost/models/prost_model.py +++ b/modelscope/models/multi_modal/prost/models/prost_model.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import os import random diff --git a/modelscope/models/multi_modal/prost/models/tokenization_clip.py b/modelscope/models/multi_modal/prost/models/tokenization_clip.py index 97ee7156a..5dc5ff6d9 100644 --- a/modelscope/models/multi_modal/prost/models/tokenization_clip.py +++ b/modelscope/models/multi_modal/prost/models/tokenization_clip.py @@ -1,5 +1,5 @@ # The implementation is adopted from the CLIP4Clip implementation, -# made pubicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip +# made publicly available under Apache License, Version 2.0 at https://github.com/ArrowLuo/CLIP4Clip import gzip import html diff --git a/modelscope/models/multi_modal/prost/models/until_config.py b/modelscope/models/multi_modal/prost/models/until_config.py index dc9753d3e..8dc56375a 100755 --- a/modelscope/models/multi_modal/prost/models/until_config.py +++ b/modelscope/models/multi_modal/prost/models/until_config.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/prost/models/until_module.py b/modelscope/models/multi_modal/prost/models/until_module.py index 20afc2c3b..c072445ad 100644 --- a/modelscope/models/multi_modal/prost/models/until_module.py +++ b/modelscope/models/multi_modal/prost/models/until_module.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/multi_modal/video_synthesis/autoencoder.py b/modelscope/models/multi_modal/video_synthesis/autoencoder.py index 7885f2626..34bcee1b0 100644 --- a/modelscope/models/multi_modal/video_synthesis/autoencoder.py +++ b/modelscope/models/multi_modal/video_synthesis/autoencoder.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from latent-diffusion, -# publicly avaialbe at https://github.com/CompVis/latent-diffusion. +# publicly available at https://github.com/CompVis/latent-diffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import numpy as np diff --git a/modelscope/models/multi_modal/video_synthesis/diffusion.py b/modelscope/models/multi_modal/video_synthesis/diffusion.py index 138fddae3..2c4d4f6d2 100644 --- a/modelscope/models/multi_modal/video_synthesis/diffusion.py +++ b/modelscope/models/multi_modal/video_synthesis/diffusion.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from latent-diffusion, -# publicly avaialbe at https://github.com/CompVis/latent-diffusion. +# publicly available at https://github.com/CompVis/latent-diffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. import torch diff --git a/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py b/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py index 0ec66069f..76f30580d 100644 --- a/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py +++ b/modelscope/models/multi_modal/video_synthesis/text_to_video_synthesis_model.py @@ -58,7 +58,7 @@ def __init__(self, model_dir, *args, **kwargs): `True`. """ super().__init__(model_dir=model_dir, *args, **kwargs) - self.device = torch.device('cuda') if torch.cuda.is_available() \ + self.device = torch.device(kwargs.get('device', 'cuda')) if torch.cuda.is_available() \ else torch.device('cpu') self.config = Config.from_file( osp.join(model_dir, ModelFile.CONFIGURATION)) diff --git a/modelscope/models/multi_modal/video_synthesis/unet_sd.py b/modelscope/models/multi_modal/video_synthesis/unet_sd.py index f3c764eb2..779320e28 100644 --- a/modelscope/models/multi_modal/video_synthesis/unet_sd.py +++ b/modelscope/models/multi_modal/video_synthesis/unet_sd.py @@ -1,5 +1,5 @@ # Part of the implementation is borrowed and modified from stable-diffusion, -# publicly avaialbe at https://github.com/Stability-AI/stablediffusion. +# publicly available at https://github.com/Stability-AI/stablediffusion. # Copyright 2021-2022 The Alibaba Fundamental Vision Team Authors. All rights reserved. 
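# A small sketch of the device-selection change in text_to_video_synthesis_model.py above:
# an explicit `device` kwarg is honoured when CUDA is available, otherwise the model falls
# back to CPU. The helper name below is illustrative and not part of the patch.
import torch

def resolve_device(**kwargs):
    return torch.device(kwargs.get('device', 'cuda')) if torch.cuda.is_available() \
        else torch.device('cpu')

print(resolve_device())                 # cuda on GPU machines, cpu otherwise
print(resolve_device(device='cuda:1'))  # cuda:1 when CUDA is available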
import math diff --git a/modelscope/models/nlp/bart/text_error_correction.py b/modelscope/models/nlp/bart/text_error_correction.py index 97c3a7a9e..09b867c4c 100644 --- a/modelscope/models/nlp/bart/text_error_correction.py +++ b/modelscope/models/nlp/bart/text_error_correction.py @@ -82,5 +82,5 @@ def forward(self, input: Dict[str, Dict]) -> TextErrorCorrectionOutput: batch_preds = [] for i in range(batch_size): # get 1-best List[Tensor] - batch_preds.append(translations[i][0]['tokens']) + batch_preds.append(translations[i][0]['tokens'].tolist()) return TextErrorCorrectionOutput(predictions=batch_preds) diff --git a/modelscope/models/nlp/dgds/backbone.py b/modelscope/models/nlp/dgds/backbone.py index 17e3c5746..9acf3937f 100644 --- a/modelscope/models/nlp/dgds/backbone.py +++ b/modelscope/models/nlp/dgds/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/fid_plug/backbone.py b/modelscope/models/nlp/fid_plug/backbone.py index 5dcddcc15..f86f35fe6 100644 --- a/modelscope/models/nlp/fid_plug/backbone.py +++ b/modelscope/models/nlp/fid_plug/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/llama/__init__.py b/modelscope/models/nlp/llama/__init__.py index d5b6fd19e..9de2d294f 100644 --- a/modelscope/models/nlp/llama/__init__.py +++ b/modelscope/models/nlp/llama/__init__.py @@ -1,8 +1,8 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
from typing import TYPE_CHECKING -from transformers.models.llama import (LlamaConfig, LlamaTokenizer, - LlamaTokenizerFast) +from transformers import LlamaTokenizer +from transformers.models.llama import LlamaConfig, LlamaTokenizerFast from modelscope.utils.import_utils import LazyImportModule diff --git a/modelscope/models/nlp/llama/backbone.py b/modelscope/models/nlp/llama/backbone.py index 0ac5bf5cc..dd22da016 100755 --- a/modelscope/models/nlp/llama/backbone.py +++ b/modelscope/models/nlp/llama/backbone.py @@ -49,6 +49,7 @@ def _instantiate(cls, **kwargs): The loaded model, which is initialized by transformers.PreTrainedModel.from_pretrained """ model_dir = kwargs.pop('model_dir', None) + device = kwargs.pop('device', None) if model_dir is None: config = LlamaConfig(**kwargs) model = cls(config) @@ -56,7 +57,8 @@ def _instantiate(cls, **kwargs): model = super(MsModelMixin, cls).from_pretrained( pretrained_model_name_or_path=model_dir, **kwargs) model.model_dir = model_dir - return model + return model if 'device_map' in kwargs \ + or device is None else model.to(device) class LlamaPreTrainedModel(MsModelMixin, LlamaPreTrainedModelHF, TorchModel): diff --git a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py index 079cfd46d..3f717298b 100644 --- a/modelscope/models/nlp/mglm/mglm_for_text_summarization.py +++ b/modelscope/models/nlp/mglm/mglm_for_text_summarization.py @@ -58,6 +58,7 @@ def setup_model(args): if args.load_pretrained is not None: args.no_load_optim = True args.load = args.load_pretrained + args.no_load_rng = True _ = load_checkpoint(model, None, None, args) return model diff --git a/modelscope/models/nlp/mglm/model/modeling_bert.py b/modelscope/models/nlp/mglm/model/modeling_bert.py index 28b5cd1ea..8d989820e 100644 --- a/modelscope/models/nlp/mglm/model/modeling_bert.py +++ b/modelscope/models/nlp/mglm/model/modeling_bert.py @@ -1,4 +1,4 @@ -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # @@ -203,7 +203,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -212,7 +212,7 @@ def __init__(self, (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `BertModel`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): @@ -743,7 +743,7 @@ def forward(self, sequence_output, pooled_output): class PreTrainedBertModel(nn.Module): """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. + a simple interface for downloading and loading pretrained models. """ def __init__(self, config, *inputs, **kwargs): @@ -799,7 +799,7 @@ def from_pretrained(cls, . 
`bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ # noqa diff --git a/modelscope/models/nlp/mglm/model/transformer.py b/modelscope/models/nlp/mglm/model/transformer.py index da944c768..c807de87d 100644 --- a/modelscope/models/nlp/mglm/model/transformer.py +++ b/modelscope/models/nlp/mglm/model/transformer.py @@ -155,7 +155,7 @@ class ParallelSelfAttention(torch.nn.Module): """Parallel self-attention layer for GPT2. Self-attention layer takes input with size [b, s, h] where b is - the batch size, s is the sequence lenght, and h is the hidden size + the batch size, s is the sequence length, and h is the hidden size and creates output of the same size. Arguments: hidden_size: total hidden size of the layer (h). diff --git a/modelscope/models/nlp/palm_v2/text_generation.py b/modelscope/models/nlp/palm_v2/text_generation.py index cd3ecdaf2..a21058fde 100644 --- a/modelscope/models/nlp/palm_v2/text_generation.py +++ b/modelscope/models/nlp/palm_v2/text_generation.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/plug/backbone.py b/modelscope/models/nlp/plug/backbone.py index 37714ed77..0442414cb 100644 --- a/modelscope/models/nlp/plug/backbone.py +++ b/modelscope/models/nlp/plug/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO NLP Team Authors. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/modelscope/models/nlp/plug_mental/backbone.py b/modelscope/models/nlp/plug_mental/backbone.py index e8531f529..918fcdbd9 100755 --- a/modelscope/models/nlp/plug_mental/backbone.py +++ b/modelscope/models/nlp/plug_mental/backbone.py @@ -1031,7 +1031,7 @@ def forward(self, head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, orignal_embeds = self.embeddings( + embedding_output, original_embeds = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, @@ -1065,7 +1065,7 @@ def forward(self, if not return_dict: return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + pooled_output) + encoder_outputs[1:] + (original_embeds, ) return AttentionBackboneModelOutputWithEmbedding( last_hidden_state=sequence_output, @@ -1074,4 +1074,4 @@ def forward(self, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) + embedding_output=original_embeds) diff --git a/modelscope/models/nlp/space_T_cn/backbone.py b/modelscope/models/nlp/space_T_cn/backbone.py index b1df58bad..42df1b12b 100644 --- a/modelscope/models/nlp/space_T_cn/backbone.py +++ b/modelscope/models/nlp/space_T_cn/backbone.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -656,7 +656,7 @@ def from_pretrained(cls, . `bert_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a BertForPreTraining instance cache_dir: an optional path to a folder in which the pre-trained models will be cached. - state_dict: an optional state dictionnary (collections.OrderedDict object) + state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) diff --git a/modelscope/models/nlp/space_T_cn/configuration.py b/modelscope/models/nlp/space_T_cn/configuration.py index e698b310d..0d39c90ed 100644 --- a/modelscope/models/nlp/space_T_cn/configuration.py +++ b/modelscope/models/nlp/space_T_cn/configuration.py @@ -1,5 +1,5 @@ # Copyright 2021-2022 The Alibaba DAMO Team Authors. All rights reserved. -# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -52,7 +52,7 @@ def __init__(self, layer in the Transformer encoder. hidden_act: The non-linear activation function (function or string) in the encoder and pooler. If string, "gelu", "relu" and "swish" are supported. - hidden_dropout_prob: The dropout probabilitiy for all fully connected + hidden_dropout_prob: The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. attention_probs_dropout_prob: The dropout ratio for the attention probabilities. @@ -60,7 +60,7 @@ def __init__(self, ever be used with. 
Typically set this to something large just in case (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `SpaceTCnConfig`. - initializer_range: The sttdev of the truncated_normal_initializer for + initializer_range: The stdev of the truncated_normal_initializer for initializing all weight matrices. """ if isinstance(vocab_size_or_config_json_file, str): diff --git a/modelscope/models/nlp/structbert/backbone.py b/modelscope/models/nlp/structbert/backbone.py index 58d324a8d..d1998e984 100755 --- a/modelscope/models/nlp/structbert/backbone.py +++ b/modelscope/models/nlp/structbert/backbone.py @@ -881,7 +881,7 @@ def forward(self, head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) - embedding_output, orignal_embeds = self.embeddings( + embedding_output, original_embeds = self.embeddings( input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, @@ -907,7 +907,7 @@ def forward(self, if not return_dict: return (sequence_output, - pooled_output) + encoder_outputs[1:] + (orignal_embeds, ) + pooled_output) + encoder_outputs[1:] + (original_embeds, ) return AttentionBackboneModelOutputWithEmbedding( last_hidden_state=sequence_output, @@ -916,4 +916,4 @@ def forward(self, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, cross_attentions=encoder_outputs.cross_attentions, - embedding_output=orignal_embeds) + embedding_output=original_embeds) diff --git a/modelscope/models/nlp/structbert/faq_question_answering.py b/modelscope/models/nlp/structbert/faq_question_answering.py index bc22ab617..6c05bcff8 100644 --- a/modelscope/models/nlp/structbert/faq_question_answering.py +++ b/modelscope/models/nlp/structbert/faq_question_answering.py @@ -375,6 +375,8 @@ def sentence_embedding(self, inputs: Dict[str, Tensor]): input_ids = torch.IntTensor(input_ids) if not isinstance(input_mask, Tensor): input_mask = torch.IntTensor(input_mask) + input_ids = input_ids.to(self.bert.device) + input_mask = input_mask.to(self.bert.device) rst = self.bert(input_ids, input_mask) last_hidden_states = rst.last_hidden_state if len(input_mask.shape) == 2: diff --git a/modelscope/msdatasets/__init__.py b/modelscope/msdatasets/__init__.py index 70200e446..534a05008 100644 --- a/modelscope/msdatasets/__init__.py +++ b/modelscope/msdatasets/__init__.py @@ -1,2 +1,2 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
-from .ms_dataset import MsDataset +from modelscope.msdatasets.ms_dataset import MsDataset diff --git a/modelscope/msdatasets/context/dataset_context_config.py b/modelscope/msdatasets/context/dataset_context_config.py index 48124d786..fce262b02 100644 --- a/modelscope/msdatasets/context/dataset_context_config.py +++ b/modelscope/msdatasets/context/dataset_context_config.py @@ -17,7 +17,8 @@ def __init__(self, dataset_name: Union[str, list], namespace: str, data_files: Union[str, Sequence[str], Mapping[str, Union[str, Sequence[str]]]], download_mode: DownloadMode, cache_root_dir: str, - use_streaming: bool, stream_batch_size: int, **kwargs): + use_streaming: bool, stream_batch_size: int, + trust_remote_code: bool, **kwargs): self._download_config = None self._data_meta_config = None @@ -44,6 +45,7 @@ def __init__(self, dataset_name: Union[str, list], namespace: str, self.use_streaming = use_streaming self.stream_batch_size = stream_batch_size self.download_virgo_files: bool = False + self.trust_remote_code: bool = trust_remote_code @property def config_kwargs(self) -> dict: diff --git a/modelscope/msdatasets/data_loader/data_loader.py b/modelscope/msdatasets/data_loader/data_loader.py index f29acc8fc..920744499 100644 --- a/modelscope/msdatasets/data_loader/data_loader.py +++ b/modelscope/msdatasets/data_loader/data_loader.py @@ -127,6 +127,7 @@ def _prepare_and_download(self) -> None: cache_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode input_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code if self.builder is None and not dataset_py_script: raise f'meta-file: {dataset_name}.py not found on the modelscope hub.' @@ -141,7 +142,7 @@ def _prepare_and_download(self) -> None: data_files=data_files, cache_dir=cache_dir, download_mode=download_mode.value, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_kwargs) else: self.dataset = self.data_files_manager.fetch_data_files( diff --git a/modelscope/msdatasets/data_loader/data_loader_manager.py b/modelscope/msdatasets/data_loader/data_loader_manager.py index 0dec5d89c..a9e58b7c4 100644 --- a/modelscope/msdatasets/data_loader/data_loader_manager.py +++ b/modelscope/msdatasets/data_loader/data_loader_manager.py @@ -57,6 +57,7 @@ def load_dataset(self, data_loader_type: enum.Enum): cache_root_dir = self.dataset_context_config.cache_root_dir download_mode = self.dataset_context_config.download_mode use_streaming = self.dataset_context_config.use_streaming + trust_remote_code = self.dataset_context_config.trust_remote_code input_config_kwargs = self.dataset_context_config.config_kwargs # load local single file @@ -81,7 +82,7 @@ def load_dataset(self, data_loader_type: enum.Enum): cache_dir=cache_root_dir, download_mode=download_mode.value, streaming=use_streaming, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) raise f'Expected local data loader type: {LocalDataLoaderType.HF_DATA_LOADER.value}.' 
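# A minimal sketch of the loader-argument change above: the data loader managers now
# forward `trust_remote_code` (plumbed through DatasetContextConfig) to
# datasets.load_dataset instead of the older `ignore_verifications=True` flag.
# Assumes a `datasets` release that accepts `trust_remote_code`; the file name is illustrative.
from datasets import load_dataset

ds = load_dataset(
    'csv',                                    # packaged builder, no remote code involved
    data_files={'train': 'train.csv'},        # hypothetical local file
    download_mode='reuse_dataset_if_exists',
    streaming=False,
    trust_remote_code=True,
)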
@@ -105,6 +106,7 @@ def load_dataset(self, data_loader_type: enum.Enum): download_mode_val = self.dataset_context_config.download_mode.value use_streaming = self.dataset_context_config.use_streaming input_config_kwargs = self.dataset_context_config.config_kwargs + trust_remote_code = self.dataset_context_config.trust_remote_code # To use the huggingface data loader if data_loader_type == RemoteDataLoaderType.HF_DATA_LOADER: @@ -117,7 +119,7 @@ def load_dataset(self, data_loader_type: enum.Enum): data_files=data_files, download_mode=download_mode_val, streaming=use_streaming, - ignore_verifications=True, + trust_remote_code=trust_remote_code, **input_config_kwargs) # download statistics self.api.dataset_download_statistics( diff --git a/modelscope/msdatasets/dataset_cls/dataset.py b/modelscope/msdatasets/dataset_cls/dataset.py index f9ffd9a72..9c1c75841 100644 --- a/modelscope/msdatasets/dataset_cls/dataset.py +++ b/modelscope/msdatasets/dataset_cls/dataset.py @@ -149,6 +149,7 @@ def _download_item(self, item): if isinstance(ex_cache_path, str): ex_cache_path = [ex_cache_path] ret[k] = ex_cache_path + ret[k.strip(':FILE')] = v except Exception as e: logger.error(e) diff --git a/modelscope/msdatasets/download/dataset_builder.py b/modelscope/msdatasets/download/dataset_builder.py index 0c5c41543..845636682 100644 --- a/modelscope/msdatasets/download/dataset_builder.py +++ b/modelscope/msdatasets/download/dataset_builder.py @@ -330,6 +330,7 @@ def __init__(self, dataset_context_config: DatasetContextConfig): super().__init__( cache_dir=self.cache_build_dir, + dataset_name=self.dataset_name, config_name=self.namespace, hash=sub_dir_hash, data_files=None, # TODO: self.meta_data_files, diff --git a/modelscope/msdatasets/download/download_config.py b/modelscope/msdatasets/download/download_config.py index 11118f85c..0fc95cd9a 100644 --- a/modelscope/msdatasets/download/download_config.py +++ b/modelscope/msdatasets/download/download_config.py @@ -6,16 +6,18 @@ class DataDownloadConfig(DownloadConfig): + """ + Extends `DownloadConfig` with additional attributes for data download. 
+ """ - def __init__(self): - self.dataset_name: Optional[str] = None - self.namespace: Optional[str] = None - self.version: Optional[str] = None - self.split: Optional[Union[str, list]] = None - self.data_dir: Optional[str] = None - self.oss_config: Optional[dict] = {} - self.meta_args_map: Optional[dict] = {} - self.num_proc: int = 4 + dataset_name: Optional[str] = None + namespace: Optional[str] = None + version: Optional[str] = None + split: Optional[Union[str, list]] = None + data_dir: Optional[str] = None + oss_config: Optional[dict] = {} + meta_args_map: Optional[dict] = {} + num_proc: int = 4 def copy(self) -> 'DataDownloadConfig': return self diff --git a/modelscope/msdatasets/download/download_manager.py b/modelscope/msdatasets/download/download_manager.py index 4799171aa..5e36cdce6 100644 --- a/modelscope/msdatasets/download/download_manager.py +++ b/modelscope/msdatasets/download/download_manager.py @@ -36,6 +36,11 @@ def _download(self, url_or_filename: str, return cached_path( url_or_filename, download_config=download_config) + def _download_single(self, url_or_filename: str, + download_config: DataDownloadConfig) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename, download_config) + class DataStreamingDownloadManager(StreamingDownloadManager): """The data streaming download manager.""" @@ -62,3 +67,7 @@ def _download(self, url_or_filename: str) -> str: else: return cached_path( url_or_filename, download_config=self.download_config) + + def _download_single(self, url_or_filename: str) -> str: + # Note: _download_single function is available for datasets>=2.19.0 + return self._download(url_or_filename) diff --git a/modelscope/msdatasets/meta/data_meta_manager.py b/modelscope/msdatasets/meta/data_meta_manager.py index 3f1e65726..e5a57f026 100644 --- a/modelscope/msdatasets/meta/data_meta_manager.py +++ b/modelscope/msdatasets/meta/data_meta_manager.py @@ -92,6 +92,10 @@ def fetch_meta_files(self) -> None: data_meta_config.meta_cache_dir = meta_cache_dir data_meta_config.dataset_scripts = dataset_scripts data_meta_config.dataset_formation = dataset_formation + if '.py' in dataset_scripts: + tmp_py_scripts = dataset_scripts['.py'] + if len(tmp_py_scripts) > 0: + data_meta_config.dataset_py_script = tmp_py_scripts[0] # Set dataset_context_config self.dataset_context_config.data_meta_config = data_meta_config @@ -112,7 +116,9 @@ def parse_dataset_structure(self): dataset_py_script = None dataset_scripts = data_meta_config.dataset_scripts if not dataset_scripts or len(dataset_scripts) == 0: - raise 'Cannot find dataset meta-files, please fetch meta from modelscope hub.' + raise FileNotFoundError( + 'Cannot find dataset meta-files, please fetch meta from modelscope hub.' + ) if '.py' in dataset_scripts: dataset_py_script = dataset_scripts['.py'][0] for json_path in dataset_scripts['.json']: @@ -121,7 +127,9 @@ def parse_dataset_structure(self): dataset_json = json.load(dataset_json_file) break if not dataset_json and not dataset_py_script: - raise f'File {dataset_name}.json and {dataset_name}.py not found, please specify at least one meta-file.' 
+ raise FileNotFoundError( + f'File {dataset_name}.json and {dataset_name}.py not found,' + 'please specify at least one meta-file.') # Parse meta and get dataset structure if dataset_py_script: diff --git a/modelscope/msdatasets/ms_dataset.py b/modelscope/msdatasets/ms_dataset.py index b720ada62..899142adc 100644 --- a/modelscope/msdatasets/ms_dataset.py +++ b/modelscope/msdatasets/ms_dataset.py @@ -13,7 +13,6 @@ from modelscope.hub.repository import DatasetRepository from modelscope.msdatasets.context.dataset_context_config import \ DatasetContextConfig -from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader from modelscope.msdatasets.data_loader.data_loader_manager import ( LocalDataLoaderManager, LocalDataLoaderType, RemoteDataLoaderManager, RemoteDataLoaderType) @@ -22,14 +21,15 @@ from modelscope.msdatasets.dataset_cls.custom_datasets.builder import \ build_custom_dataset from modelscope.msdatasets.utils.delete_utils import DatasetDeleteManager +from modelscope.msdatasets.utils.hf_datasets_util import load_dataset_with_ctx from modelscope.msdatasets.utils.upload_utils import DatasetUploadManager from modelscope.preprocessors import build_preprocessor from modelscope.utils.config import Config, ConfigDict from modelscope.utils.config_ds import MS_DATASETS_CACHE from modelscope.utils.constant import (DEFAULT_DATASET_NAMESPACE, DEFAULT_DATASET_REVISION, ConfigFields, - DownloadMode, Hubs, ModeKeys, Tasks, - UploadMode, VirgoDatasetConfig) + DatasetFormations, DownloadMode, Hubs, + ModeKeys, Tasks, UploadMode) from modelscope.utils.import_utils import is_tf_available, is_torch_available from modelscope.utils.logger import get_logger @@ -167,6 +167,8 @@ def load( stream_batch_size: Optional[int] = 1, custom_cfg: Optional[Config] = Config(), token: Optional[str] = None, + dataset_info_only: Optional[bool] = False, + trust_remote_code: Optional[bool] = True, **config_kwargs, ) -> Union[dict, 'MsDataset', NativeIterableDataset]: """Load a MsDataset from the ModelScope Hub, Hugging Face Hub, urls, or a local dataset. @@ -196,6 +198,8 @@ def load( custom_cfg (str, Optional): Model configuration, this can be used for custom datasets. see https://modelscope.cn/docs/Configuration%E8%AF%A6%E8%A7%A3 token (str, Optional): SDK token of ModelScope. + dataset_info_only (bool, Optional): If set to True, only return the dataset config and info (dict). + trust_remote_code (bool, Optional): If set to True, trust the remote code. 
**config_kwargs (additional keyword arguments): Keyword arguments to be passed Returns: @@ -248,6 +252,7 @@ def load( cache_root_dir=cache_dir, use_streaming=use_streaming, stream_batch_size=stream_batch_size, + trust_remote_code=trust_remote_code, **config_kwargs) # Load from local disk @@ -266,32 +271,66 @@ def load( return dataset_inst # Load from the huggingface hub elif hub == Hubs.huggingface: - dataset_inst = RemoteDataLoaderManager( - dataset_context_config).load_dataset( - RemoteDataLoaderType.HF_DATA_LOADER) - dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) - if isinstance(dataset_inst, MsDataset): - dataset_inst._dataset_context_config = dataset_context_config - if custom_cfg: - dataset_inst.to_custom_dataset( - custom_cfg=custom_cfg, **config_kwargs) - dataset_inst.is_custom = True - return dataset_inst + from datasets import load_dataset + return load_dataset( + dataset_name, + name=subset_name, + split=split, + streaming=use_streaming, + download_mode=download_mode.value, + trust_remote_code=trust_remote_code, + **config_kwargs) + # Load from the modelscope hub elif hub == Hubs.modelscope: - remote_dataloader_manager = RemoteDataLoaderManager( - dataset_context_config) - dataset_inst = remote_dataloader_manager.load_dataset( - RemoteDataLoaderType.MS_DATA_LOADER) - dataset_inst = MsDataset.to_ms_dataset(dataset_inst, target=target) - if isinstance(dataset_inst, MsDataset): - dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config - if custom_cfg: - dataset_inst.to_custom_dataset( - custom_cfg=custom_cfg, **config_kwargs) - dataset_inst.is_custom = True - return dataset_inst + + # Get dataset type from ModelScope Hub; dataset_type->4: General Dataset + from modelscope.hub.api import HubApi + _api = HubApi() + dataset_id_on_hub, dataset_type = _api.get_dataset_id_and_type( + dataset_name=dataset_name, namespace=namespace) + + # Load from the ModelScope Hub for type=4 (general) + if str(dataset_type) == str(DatasetFormations.general.value): + + with load_dataset_with_ctx( + path=namespace + '/' + dataset_name, + name=subset_name, + data_dir=data_dir, + data_files=data_files, + split=split, + cache_dir=cache_dir, + features=None, + download_config=None, + download_mode=download_mode.value, + revision=version, + token=token, + streaming=use_streaming, + dataset_info_only=dataset_info_only, + trust_remote_code=trust_remote_code, + **config_kwargs) as dataset_res: + + return dataset_res + + else: + + remote_dataloader_manager = RemoteDataLoaderManager( + dataset_context_config) + dataset_inst = remote_dataloader_manager.load_dataset( + RemoteDataLoaderType.MS_DATA_LOADER) + dataset_inst = MsDataset.to_ms_dataset( + dataset_inst, target=target) + if isinstance(dataset_inst, MsDataset): + dataset_inst._dataset_context_config = remote_dataloader_manager.dataset_context_config + if custom_cfg: + dataset_inst.to_custom_dataset( + custom_cfg=custom_cfg, **config_kwargs) + dataset_inst.is_custom = True + return dataset_inst + elif hub == Hubs.virgo: + from modelscope.msdatasets.data_loader.data_loader import VirgoDownloader + from modelscope.utils.constant import VirgoDatasetConfig # Rewrite the namespace, version and cache_dir for virgo dataset. 
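# A minimal usage sketch for the reworked MsDataset.load() path above, assuming a public
# dataset hosted on the ModelScope hub; the dataset, subset and split names are illustrative.
from modelscope.msdatasets import MsDataset

ds = MsDataset.load(
    'clue',                        # dataset name on the ModelScope hub (illustrative)
    subset_name='afqmc',           # illustrative subset
    split='train',
    trust_remote_code=True,        # new flag, forwarded to the underlying loader
)
print(next(iter(ds)))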
if namespace == DEFAULT_DATASET_NAMESPACE: dataset_context_config.namespace = VirgoDatasetConfig.default_virgo_namespace @@ -323,6 +362,10 @@ def upload( chunksize: Optional[int] = 1, filter_hidden_files: Optional[bool] = True, upload_mode: Optional[UploadMode] = UploadMode.OVERWRITE) -> None: + r""" + @deprecated + This method is deprecated and may be removed in future releases, please use git command line instead. + """ """Upload dataset file or directory to the ModelScope Hub. Please log in to the ModelScope Hub first. Args: @@ -346,6 +389,10 @@ def upload( None """ + warnings.warn( + 'upload is deprecated, please use git command line to upload the dataset.', + DeprecationWarning) + if not object_name: raise ValueError('object_name cannot be empty!') @@ -393,6 +440,10 @@ def clone_meta(dataset_work_dir: str, None """ + warnings.warn( + 'upload is deprecated, please use git command line to upload the dataset.', + DeprecationWarning) + _repo = DatasetRepository( repo_work_dir=dataset_work_dir, dataset_id=dataset_id, diff --git a/modelscope/msdatasets/utils/dataset_utils.py b/modelscope/msdatasets/utils/dataset_utils.py index b40915eb8..960693c17 100644 --- a/modelscope/msdatasets/utils/dataset_utils.py +++ b/modelscope/msdatasets/utils/dataset_utils.py @@ -195,7 +195,7 @@ def get_dataset_files(subset_split_into: dict, for split, info in subset_split_into.items(): custom_type_map[split] = info.get('custom', '') - meta_map[split] = modelscope_api.get_dataset_file_url( + meta_map[split] = modelscope_api.get_dataset_file_url_origin( info.get('meta', ''), dataset_name, namespace, revision) if info.get('file'): file_map[split] = info['file'] @@ -212,7 +212,10 @@ def get_dataset_files(subset_split_into: dict, csv_delimiter = context_config.config_kwargs.get('delimiter', ',') csv_df = pd.read_csv( - meta_csv_file_path, iterator=False, delimiter=csv_delimiter) + meta_csv_file_path, + iterator=False, + delimiter=csv_delimiter, + escapechar='\\') target_col = csv_df.columns[csv_df.columns.str.contains( ':FILE')].to_list() if len(target_col) == 0: diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py new file mode 100644 index 000000000..8bd768dc1 --- /dev/null +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -0,0 +1,1381 @@ +# noqa: isort:skip_file, yapf: disable +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 
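# A small sketch of the csv meta parsing change in dataset_utils.py above: passing
# escapechar='\\' lets a ':FILE' column contain delimiter characters escaped with a
# backslash. The in-memory csv below is illustrative only.
import io
import pandas as pd

meta_csv = 'image:FILE,label\nimgs/cat\\,01.jpg,cat\n'
df = pd.read_csv(io.StringIO(meta_csv), iterator=False, delimiter=',', escapechar='\\')
print(df.columns[df.columns.str.contains(':FILE')].to_list())   # ['image:FILE']
print(df['image:FILE'][0])                                      # imgs/cat,01.jpg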
+import importlib +import contextlib +import os +import warnings +from functools import partial +from pathlib import Path +from typing import Dict, Iterable, List, Mapping, Optional, Sequence, Union, Tuple, Literal + +from urllib.parse import urlencode + +import requests +from datasets import (BuilderConfig, Dataset, DatasetBuilder, DatasetDict, + DownloadConfig, DownloadManager, DownloadMode, Features, + IterableDataset, IterableDatasetDict, Split, + VerificationMode, Version, config, data_files) +from datasets.data_files import ( + FILES_TO_IGNORE, DataFilesDict, DataFilesList, EmptyDatasetError, + _get_data_files_patterns, _is_inside_unrequested_special_dir, + _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir, get_metadata_patterns, sanitize_patterns) +from datasets.download.streaming_download_manager import ( + _prepare_path_and_storage_options, xbasename, xjoin) +from datasets.exceptions import DataFilesNotFoundError, DatasetNotFoundError +from datasets.info import DatasetInfosDict +from datasets.load import ( + ALL_ALLOWED_EXTENSIONS, BuilderConfigsParameters, + CachedDatasetModuleFactory, DatasetModule, + HubDatasetModuleFactoryWithoutScript, + HubDatasetModuleFactoryWithParquetExport, + HubDatasetModuleFactoryWithScript, LocalDatasetModuleFactoryWithoutScript, + LocalDatasetModuleFactoryWithScript, PackagedDatasetModuleFactory, + create_builder_configs_from_metadata_configs, get_dataset_builder_class, + import_main_class, infer_module_for_data_files, files_to_hash, + _get_importable_file_path, resolve_trust_remote_code, _create_importable_file, _load_importable_file, + init_dynamic_modules) +from datasets.naming import camelcase_to_snakecase +from datasets.packaged_modules import (_EXTENSION_TO_MODULE, + _MODULE_SUPPORTS_METADATA, + _MODULE_TO_EXTENSIONS, + _PACKAGED_DATASETS_MODULES) +from datasets.utils import file_utils +from datasets.utils.file_utils import (OfflineModeIsEnabled, + _raise_if_offline_mode_is_enabled, + cached_path, is_local_path, + is_relative_path, + relative_to_absolute_path) +from datasets.utils.info_utils import is_small_dataset +from datasets.utils.metadata import MetadataConfigs +from datasets.utils.py_utils import get_imports, map_nested +from datasets.utils.track import tracked_str +from fsspec import filesystem +from fsspec.core import _un_chain +from fsspec.utils import stringify_path +from huggingface_hub import (DatasetCard, DatasetCardData) +from huggingface_hub.hf_api import DatasetInfo as HfDatasetInfo +from huggingface_hub.hf_api import HfApi, RepoFile, RepoFolder +from packaging import version + +from modelscope import HubApi +from modelscope.hub.utils.utils import get_endpoint +from modelscope.msdatasets.utils.hf_file_utils import get_from_cache_ms +from modelscope.utils.config_ds import MS_DATASETS_CACHE +from modelscope.utils.constant import DEFAULT_DATASET_NAMESPACE +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +ExpandDatasetProperty_T = Literal[ + 'author', + 'cardData', + 'citation', + 'createdAt', + 'disabled', + 'description', + 'downloads', + 'downloadsAllTime', + 'gated', + 'lastModified', + 'likes', + 'paperswithcode_id', + 'private', + 'siblings', + 'sha', + 'tags', +] + + +def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) -> str: + url_or_filename = str(url_or_filename) + # for temp val + revision = None + if url_or_filename.startswith('hf://'): + revision, url_or_filename = url_or_filename.split('@', 1)[-1].split('/', 1) + if is_relative_path(url_or_filename): 
+ # append the relative path to the base_path + # url_or_filename = url_or_path_join(self._base_path, url_or_filename) + revision = revision or 'master' + # Note: make sure the FilePath is the last param + params: dict = {'Source': 'SDK', 'Revision': revision, 'FilePath': url_or_filename} + params: str = urlencode(params) + url_or_filename = self._base_path + params + + out = cached_path(url_or_filename, download_config=download_config) + out = tracked_str(out) + out.set_origin(url_or_filename) + return out + + +def _dataset_info( + self, + repo_id: str, + *, + revision: Optional[str] = None, + timeout: Optional[float] = None, + files_metadata: bool = False, + token: Optional[Union[bool, str]] = None, + expand: Optional[List[ExpandDatasetProperty_T]] = None, +) -> HfDatasetInfo: + """ + Get info on one specific dataset on huggingface.co. + + Dataset can be private if you pass an acceptable token. + + Args: + repo_id (`str`): + A namespace (user or an organization) and a repo name separated + by a `/`. + revision (`str`, *optional*): + The revision of the dataset repository from which to get the + information. + timeout (`float`, *optional*): + Whether to set a timeout for the request to the Hub. + files_metadata (`bool`, *optional*): + Whether or not to retrieve metadata for files in the repository + (size, LFS metadata, etc). Defaults to `False`. + token (`bool` or `str`, *optional*): + A valid authentication token (see https://huggingface.co/settings/token). + If `None` or `True` and machine is logged in (through `huggingface-cli login` + or [`~huggingface_hub.login`]), token will be retrieved from the cache. + If `False`, token is not sent in the request header. + + Returns: + [`hf_api.DatasetInfo`]: The dataset repository information. + + + + Raises the following errors: + + - [`~utils.RepositoryNotFoundError`] + If the repository to download from cannot be found. This may be because it doesn't exist, + or because it is set to `private` and you do not have access. + - [`~utils.RevisionNotFoundError`] + If the revision to download from cannot be found. + + + """ + _api = HubApi() + _namespace, _dataset_name = repo_id.split('/') + dataset_hub_id, dataset_type = _api.get_dataset_id_and_type( + dataset_name=_dataset_name, namespace=_namespace) + + revision: str = revision or 'master' + data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id, + revision=revision, + files_metadata=files_metadata, + timeout=timeout) + + # Parse data + data_d: dict = data['Data'] + data_file_list: list = data_d['Files'] + # commit_info: dict = data_d['LatestCommitter'] + + # Update data # TODO: columns align with HfDatasetInfo + data['id'] = repo_id + data['private'] = False + data['author'] = repo_id.split('/')[0] if repo_id else None + data['sha'] = revision + data['lastModified'] = None + data['gated'] = False + data['disabled'] = False + data['downloads'] = 0 + data['likes'] = 0 + data['tags'] = [] + data['cardData'] = [] + data['createdAt'] = None + + # e.g. 
{'rfilename': 'xxx', 'blobId': 'xxx', 'size': 0, 'lfs': {'size': 0, 'sha256': 'xxx', 'pointerSize': 0}} + data['siblings'] = [] + for file_info_d in data_file_list: + file_info = { + 'rfilename': file_info_d['Path'], + 'blobId': file_info_d['Id'], + 'size': file_info_d['Size'], + 'type': 'directory' if file_info_d['Type'] == 'tree' else 'file', + 'lfs': { + 'size': file_info_d['Size'], + 'sha256': file_info_d['Sha256'], + 'pointerSize': 0 + } + } + data['siblings'].append(file_info) + + return HfDatasetInfo(**data) + + +def _list_repo_tree( + self, + repo_id: str, + path_in_repo: Optional[str] = None, + *, + recursive: bool = True, + expand: bool = False, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + token: Optional[Union[bool, str]] = None, +) -> Iterable[Union[RepoFile, RepoFolder]]: + + _api = HubApi(timeout=3 * 60, max_retries=3) + + if is_relative_path(repo_id) and repo_id.count('/') == 1: + _namespace, _dataset_name = repo_id.split('/') + elif is_relative_path(repo_id) and repo_id.count('/') == 0: + logger.warning(f'Got a relative path: {repo_id} without namespace, ' + f'Use default namespace: {DEFAULT_DATASET_NAMESPACE}') + _namespace, _dataset_name = DEFAULT_DATASET_NAMESPACE, repo_id + else: + raise ValueError(f'Invalid repo_id: {repo_id} !') + + page_number = 1 + page_size = 100 + while True: + data: dict = _api.list_repo_tree(dataset_name=_dataset_name, + namespace=_namespace, + revision=revision or 'master', + root_path=path_in_repo or None, + recursive=True, + page_number=page_number, + page_size=page_size, + ) + if not ('Code' in data and data['Code'] == 200): + logger.error(f'Get dataset: {repo_id} file list failed, message: {data["Message"]}') + return None + + # Parse data (Type: 'tree' or 'blob') + data_file_list: list = data['Data']['Files'] + + for file_info_d in data_file_list: + path_info = {} + path_info['type'] = 'directory' if file_info_d['Type'] == 'tree' else 'file' + path_info['path'] = file_info_d['Path'] + path_info['size'] = file_info_d['Size'] + path_info['oid'] = file_info_d['Sha256'] + + yield RepoFile(**path_info) if path_info['type'] == 'file' else RepoFolder(**path_info) + + if len(data_file_list) < page_size: + break + page_number += 1 + + +def _get_paths_info( + self, + repo_id: str, + paths: Union[List[str], str], + *, + expand: bool = False, + revision: Optional[str] = None, + repo_type: Optional[str] = None, + token: Optional[Union[bool, str]] = None, +) -> List[Union[RepoFile, RepoFolder]]: + + _api = HubApi() + _namespace, _dataset_name = repo_id.split('/') + dataset_hub_id, dataset_type = _api.get_dataset_id_and_type( + dataset_name=_dataset_name, namespace=_namespace) + + revision: str = revision or 'master' + data = _api.get_dataset_infos(dataset_hub_id=dataset_hub_id, + revision=revision, + files_metadata=False, + recursive='False') + data_d: dict = data['Data'] + data_file_list: list = data_d['Files'] + + return [ + RepoFile(path=item_d['Name'], + size=item_d['Size'], + oid=item_d['Revision'], + lfs=None, # TODO: lfs type to be supported + last_commit=None, # TODO: lfs type to be supported + security=None + ) for item_d in data_file_list if item_d['Name'] == 'README.md' + ] + + +def get_fs_token_paths( + urlpath, + storage_options=None, + protocol=None, +): + if isinstance(urlpath, (list, tuple, set)): + if not urlpath: + raise ValueError('empty urlpath sequence') + urlpath0 = stringify_path(list(urlpath)[0]) + else: + urlpath0 = stringify_path(urlpath) + storage_options = storage_options or {} + if protocol: + 
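# A minimal, self-contained sketch (hypothetical helper name) of the paging
# loop used by _list_repo_tree above: request fixed-size pages and stop as
# soon as a page comes back shorter than page_size.
def _iter_pages(fetch_page, page_size=100):
    page_number = 1
    while True:
        items = fetch_page(page_number, page_size)
        yield from items
        if len(items) < page_size:
            break
        page_number += 1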
storage_options['protocol'] = protocol + chain = _un_chain(urlpath0, storage_options or {}) + inkwargs = {} + # Reverse iterate the chain, creating a nested target_* structure + for i, ch in enumerate(reversed(chain)): + urls, nested_protocol, kw = ch + if i == len(chain) - 1: + inkwargs = dict(**kw, **inkwargs) + continue + inkwargs['target_options'] = dict(**kw, **inkwargs) + inkwargs['target_protocol'] = nested_protocol + inkwargs['fo'] = urls + paths, protocol, _ = chain[0] + fs = filesystem(protocol, **inkwargs) + + return fs + + +def _resolve_pattern( + pattern: str, + base_path: str, + allowed_extensions: Optional[List[str]] = None, + download_config: Optional[DownloadConfig] = None, +) -> List[str]: + """ + Resolve the paths and URLs of the data files from the pattern passed by the user. + + You can use patterns to resolve multiple local files. Here are a few examples: + - *.csv to match all the CSV files at the first level + - **.csv to match all the CSV files at any level + - data/* to match all the files inside "data" + - data/** to match all the files inside "data" and its subdirectories + + The patterns are resolved using the fsspec glob. + + glob.glob, Path.glob, Path.match or fnmatch do not support ** with a prefix/suffix other than a forward slash /. + For instance, this means **.json is the same as *.json. On the contrary, the fsspec glob has no limits regarding the ** prefix/suffix, # noqa: E501 + resulting in **.json being equivalent to **/*.json. + + More generally: + - '*' matches any character except a forward-slash (to match just the file or directory name) + - '**' matches any character including a forward-slash / + + Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested. + The same applies to special directories that start with a double underscore like "__pycache__". + You can still include one if the pattern explicilty mentions it: + - to include a hidden file: "*/.hidden.txt" or "*/.*" + - to include a hidden directory: ".hidden/*" or ".*/*" + - to include a special directory: "__special__/*" or "__*/*" + + Example:: + + >>> from datasets.data_files import resolve_pattern + >>> base_path = "." + >>> resolve_pattern("docs/**/*.py", base_path) + [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py'] + + Args: + pattern (str): Unix pattern or paths or URLs of the data files to resolve. + The paths can be absolute or relative to base_path. + Remote filesystems using fsspec are supported, e.g. with the hf:// protocol. + base_path (str): Base path to use when resolving relative paths. + allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions). + For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"] + Returns: + List[str]: List of paths or URLs to the local or remote files that match the patterns. 
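    A short, hedged illustration of the glob semantics above, using fsspec's
    local filesystem (the directory layout is hypothetical):

        >>> import fsspec
        >>> fs = fsspec.filesystem('file')
        >>> fs.glob('data/*.csv')    # '*' does not cross '/', so top-level CSVs only
        >>> fs.glob('data/**.csv')   # '**' crosses '/', so CSVs at any depth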
+ """ + if is_relative_path(pattern): + pattern = xjoin(base_path, pattern) + elif is_local_path(pattern): + base_path = os.path.splitdrive(pattern)[0] + os.sep + else: + base_path = '' + # storage_options: {'hf': {'token': None, 'endpoint': 'https://huggingface.co'}} + pattern, storage_options = _prepare_path_and_storage_options( + pattern, download_config=download_config) + fs = get_fs_token_paths(pattern, storage_options=storage_options) + fs_base_path = base_path.split('::')[0].split('://')[-1] or fs.root_marker + fs_pattern = pattern.split('::')[0].split('://')[-1] + files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)} + protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0] + protocol_prefix = protocol + '://' if protocol != 'file' else '' + glob_kwargs = {} + if protocol == 'hf' and config.HF_HUB_VERSION >= version.parse('0.20.0'): + # 10 times faster glob with detail=True (ignores costly info like lastCommit) + glob_kwargs['expand_info'] = False + + try: + tmp_file_paths = fs.glob(pattern, detail=True, **glob_kwargs) + except FileNotFoundError: + raise DataFilesNotFoundError(f"Unable to find '{pattern}'") + + matched_paths = [ + filepath if filepath.startswith(protocol_prefix) else protocol_prefix + + filepath for filepath, info in tmp_file_paths.items() + if info['type'] == 'file' and ( + xbasename(filepath) not in files_to_ignore) + and not _is_inside_unrequested_special_dir( + os.path.relpath(filepath, fs_base_path), + os.path.relpath(fs_pattern, fs_base_path)) and # noqa: W504 + not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir( # noqa: W504 + os.path.relpath(filepath, fs_base_path), + os.path.relpath(fs_pattern, fs_base_path)) + ] # ignore .ipynb and __pycache__, but keep /../ + if allowed_extensions is not None: + out = [ + filepath for filepath in matched_paths + if any('.' + suffix in allowed_extensions + for suffix in xbasename(filepath).split('.')[1:]) + ] + if len(out) < len(matched_paths): + invalid_matched_files = list(set(matched_paths) - set(out)) + logger.info( + f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: " + f'{invalid_matched_files}') + else: + out = matched_paths + if not out: + error_msg = f"Unable to find '{pattern}'" + if allowed_extensions is not None: + error_msg += f' with any supported extension {list(allowed_extensions)}' + raise FileNotFoundError(error_msg) + return out + + +def _get_data_patterns( + base_path: str, + download_config: Optional[DownloadConfig] = None) -> Dict[str, + List[str]]: + """ + Get the default pattern from a directory testing all the supported patterns. + The first patterns to return a non-empty list of data files is returned. 
+ + Some examples of supported patterns: + + Input: + + my_dataset_repository/ + ├── README.md + └── dataset.csv + + Output: + + {"train": ["**"]} + + Input: + + my_dataset_repository/ + ├── README.md + ├── train.csv + └── test.csv + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train.csv + └── test.csv + + my_dataset_repository/ + ├── README.md + ├── train_0.csv + ├── train_1.csv + ├── train_2.csv + ├── train_3.csv + ├── test_0.csv + └── test_1.csv + + Output: + + {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', + 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], + 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', + 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + + Input: + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train/ + │ ├── shard_0.csv + │ ├── shard_1.csv + │ ├── shard_2.csv + │ └── shard_3.csv + └── test/ + ├── shard_0.csv + └── shard_1.csv + + Output: + + {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', + 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'], + 'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', + 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]} + + Input: + + my_dataset_repository/ + ├── README.md + └── data/ + ├── train-00000-of-00003.csv + ├── train-00001-of-00003.csv + ├── train-00002-of-00003.csv + ├── test-00000-of-00001.csv + ├── random-00000-of-00003.csv + ├── random-00001-of-00003.csv + └── random-00002-of-00003.csv + + Output: + + {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], + 'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'], + 'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']} + + In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS. 
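    A hedged usage sketch (hypothetical local path), mirroring how the module
    factories below consume the inferred patterns:

        >>> patterns = _get_data_patterns('path/to/my_dataset_repository')
        >>> data_files = DataFilesDict.from_patterns(
        ...     patterns,
        ...     base_path='path/to/my_dataset_repository',
        ...     allowed_extensions=ALL_ALLOWED_EXTENSIONS,
        ... )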
+ """ + resolver = partial( + _resolve_pattern, base_path=base_path, download_config=download_config) + try: + return _get_data_files_patterns(resolver) + except FileNotFoundError: + raise EmptyDatasetError( + f"The directory at {base_path} doesn't contain any data files" + ) from None + + +def get_module_without_script(self) -> DatasetModule: + _ms_api = HubApi() + _repo_id: str = self.name + _namespace, _dataset_name = _repo_id.split('/') + + # hfh_dataset_info = HfApi(config.HF_ENDPOINT).dataset_info( + # self.name, + # revision=self.revision, + # token=self.download_config.token, + # timeout=100.0, + # ) + # even if metadata_configs is not None (which means that we will resolve files for each config later) + # we cannot skip resolving all files because we need to infer module name by files extensions + # revision = hfh_dataset_info.sha # fix the revision in case there are new commits in the meantime + revision = self.revision or 'master' + base_path = f"hf://datasets/{self.name}@{revision}/{self.data_dir or ''}".rstrip( + '/') + + download_config = self.download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading [README.md]' + try: + url_or_filename = _ms_api.get_dataset_file_url( + file_name='README.md', + dataset_name=_dataset_name, + namespace=_namespace, + revision=revision, + extension_filter=False, + ) + + dataset_readme_path = cached_path( + url_or_filename=url_or_filename, download_config=download_config) + dataset_card_data = DatasetCard.load(Path(dataset_readme_path)).data + except FileNotFoundError: + dataset_card_data = DatasetCardData() + + subset_name: str = download_config.storage_options.get('name', None) + + metadata_configs = MetadataConfigs.from_dataset_card_data( + dataset_card_data) + dataset_infos = DatasetInfosDict.from_dataset_card_data(dataset_card_data) + # we need a set of data files to find which dataset builder to use + # because we need to infer module name by files extensions + if self.data_files is not None: + patterns = sanitize_patterns(self.data_files) + elif metadata_configs and 'data_files' in next( + iter(metadata_configs.values())): + + if subset_name is not None: + subset_data_files = metadata_configs[subset_name]['data_files'] + else: + subset_data_files = next(iter(metadata_configs.values()))['data_files'] + patterns = sanitize_patterns(subset_data_files) + else: + patterns = _get_data_patterns( + base_path, download_config=self.download_config) + + data_files = DataFilesDict.from_patterns( + patterns, + base_path=base_path, + allowed_extensions=ALL_ALLOWED_EXTENSIONS, + download_config=self.download_config, + ) + module_name, default_builder_kwargs = infer_module_for_data_files( + data_files=data_files, + path=self.name, + download_config=self.download_config, + ) + data_files = data_files.filter_extensions( + _MODULE_TO_EXTENSIONS[module_name]) + # Collect metadata files if the module supports them + supports_metadata = module_name in _MODULE_SUPPORTS_METADATA + if self.data_files is None and supports_metadata: + try: + metadata_patterns = get_metadata_patterns( + base_path, download_config=self.download_config) + except FileNotFoundError: + metadata_patterns = None + if metadata_patterns is not None: + metadata_data_files_list = DataFilesList.from_patterns( + metadata_patterns, + download_config=self.download_config, + base_path=base_path) + if metadata_data_files_list: + data_files = DataFilesDict({ + split: data_files_list + metadata_data_files_list + for split, data_files_list in 
data_files.items() + }) + + module_path, _ = _PACKAGED_DATASETS_MODULES[module_name] + + if metadata_configs: + builder_configs, default_config_name = create_builder_configs_from_metadata_configs( + module_path, + metadata_configs, + base_path=base_path, + supports_metadata=supports_metadata, + default_builder_kwargs=default_builder_kwargs, + download_config=self.download_config, + ) + else: + builder_configs: List[BuilderConfig] = [ + import_main_class(module_path).BUILDER_CONFIG_CLASS( + data_files=data_files, + **default_builder_kwargs, + ) + ] + default_config_name = None + builder_kwargs = { + # "base_path": hf_hub_url(self.name, "", revision=revision).rstrip("/"), + 'base_path': + _ms_api.get_file_base_path( + namespace=_namespace, + dataset_name=_dataset_name, + ), + 'repo_id': + self.name, + 'dataset_name': + camelcase_to_snakecase(Path(self.name).name), + 'data_files': data_files, + } + download_config = self.download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading metadata' + + # Note: `dataset_infos.json` is deprecated and can cause an error during loading if it exists + + if default_config_name is None and len(dataset_infos) == 1: + default_config_name = next(iter(dataset_infos)) + + hash = revision + return DatasetModule( + module_path, + hash, + builder_kwargs, + dataset_infos=dataset_infos, + builder_configs_parameters=BuilderConfigsParameters( + metadata_configs=metadata_configs, + builder_configs=builder_configs, + default_config_name=default_config_name, + ), + ) + + +def _download_additional_modules( + name: str, + dataset_name: str, + namespace: str, + revision: str, + imports: Tuple[str, str, str, str], + download_config: Optional[DownloadConfig] +) -> List[Tuple[str, str]]: + """ + Download additional module for a module .py at URL (or local path) /.py + The imports must have been parsed first using ``get_imports``. + + If some modules need to be installed with pip, an error is raised showing how to install them. + This function return the list of downloaded modules as tuples (import_name, module_file_path). + + The downloaded modules can then be moved into an importable directory + with ``_copy_script_and_other_resources_in_importable_dir``. + """ + local_imports = [] + library_imports = [] + download_config = download_config.copy() + if download_config.download_desc is None: + download_config.download_desc = 'Downloading extra modules' + for import_type, import_name, import_path, sub_directory in imports: + if import_type == 'library': + library_imports.append((import_name, import_path)) # Import from a library + continue + + if import_name == name: + raise ValueError( + f'Error in the {name} script, importing relative {import_name} module ' + f'but {import_name} is the name of the script. ' + f"Please change relative import {import_name} to another name and add a '# From: URL_OR_PATH' " + f'comment pointing to the original relative import file path.' 
+ ) + if import_type == 'internal': + _api = HubApi() + # url_or_filename = url_or_path_join(base_path, import_path + ".py") + file_name = import_path + '.py' + url_or_filename = _api.get_dataset_file_url(file_name=file_name, + dataset_name=dataset_name, + namespace=namespace, + revision=revision,) + elif import_type == 'external': + url_or_filename = import_path + else: + raise ValueError('Wrong import_type') + + local_import_path = cached_path( + url_or_filename, + download_config=download_config, + ) + if sub_directory is not None: + local_import_path = os.path.join(local_import_path, sub_directory) + local_imports.append((import_name, local_import_path)) + + # Check library imports + needs_to_be_installed = {} + for library_import_name, library_import_path in library_imports: + try: + lib = importlib.import_module(library_import_name) # noqa F841 + except ImportError: + if library_import_name not in needs_to_be_installed or library_import_path != library_import_name: + needs_to_be_installed[library_import_name] = library_import_path + if needs_to_be_installed: + _dependencies_str = 'dependencies' if len(needs_to_be_installed) > 1 else 'dependency' + _them_str = 'them' if len(needs_to_be_installed) > 1 else 'it' + if 'sklearn' in needs_to_be_installed.keys(): + needs_to_be_installed['sklearn'] = 'scikit-learn' + if 'Bio' in needs_to_be_installed.keys(): + needs_to_be_installed['Bio'] = 'biopython' + raise ImportError( + f'To be able to use {name}, you need to install the following {_dependencies_str}: ' + f"{', '.join(needs_to_be_installed)}.\nPlease install {_them_str} using 'pip install " + f"{' '.join(needs_to_be_installed.values())}' for instance." + ) + return local_imports + + +def get_module_with_script(self) -> DatasetModule: + + _api = HubApi() + _dataset_name: str = self.name.split('/')[-1] + _namespace: str = self.name.split('/')[0] + + script_file_name = f'{_dataset_name}.py' + script_url: str = _api.get_dataset_file_url( + file_name=script_file_name, + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + extension_filter=False, + ) + local_script_path = cached_path( + url_or_filename=script_url, download_config=self.download_config) + + dataset_infos_path = None + # try: + # dataset_infos_url: str = _api.get_dataset_file_url( + # file_name='dataset_infos.json', + # dataset_name=_dataset_name, + # namespace=_namespace, + # revision=self.revision, + # extension_filter=False, + # ) + # dataset_infos_path = cached_path( + # url_or_filename=dataset_infos_url, download_config=self.download_config) + # except Exception as e: + # logger.info(f'Cannot find dataset_infos.json: {e}') + # dataset_infos_path = None + + dataset_readme_url: str = _api.get_dataset_file_url( + file_name='README.md', + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + extension_filter=False, + ) + dataset_readme_path = cached_path( + url_or_filename=dataset_readme_url, download_config=self.download_config) + + imports = get_imports(local_script_path) + local_imports = _download_additional_modules( + name=self.name, + dataset_name=_dataset_name, + namespace=_namespace, + revision=self.revision, + imports=imports, + download_config=self.download_config, + ) + additional_files = [] + if dataset_infos_path: + additional_files.append((config.DATASETDICT_INFOS_FILENAME, dataset_infos_path)) + if dataset_readme_path: + additional_files.append((config.REPOCARD_FILENAME, dataset_readme_path)) + # copy the script and the files in an importable directory + 
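# A minimal, self-contained sketch (hypothetical helper name) of the library
# check performed by _download_additional_modules above: try to import every
# library a dataset script declares and collect the pip names of anything missing.
def _missing_libraries(libraries):
    # libraries: {import_name: pip_name}, e.g. {'Bio': 'biopython', 'sklearn': 'scikit-learn'}
    missing = {}
    for import_name, pip_name in libraries.items():
        try:
            importlib.import_module(import_name)
        except ImportError:
            missing[import_name] = pip_name
    return missing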
dynamic_modules_path = self.dynamic_modules_path if self.dynamic_modules_path else init_dynamic_modules() + hash = files_to_hash([local_script_path] + [loc[1] for loc in local_imports]) + importable_file_path = _get_importable_file_path( + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + ) + if not os.path.exists(importable_file_path): + trust_remote_code = resolve_trust_remote_code(trust_remote_code=self.trust_remote_code, repo_id=self.name) + if trust_remote_code: + _create_importable_file( + local_path=local_script_path, + local_imports=local_imports, + additional_files=additional_files, + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + download_mode=self.download_mode, + ) + else: + raise ValueError( + f'Loading {self.name} requires you to execute the dataset script in that' + ' repo on your local machine. Make sure you have read the code there to avoid malicious use, then' + ' set the option `trust_remote_code=True` to remove this error.' + ) + module_path, hash = _load_importable_file( + dynamic_modules_path=dynamic_modules_path, + module_namespace='datasets', + subdirectory_name=hash, + name=self.name, + ) + # make the new module to be noticed by the import system + importlib.invalidate_caches() + builder_kwargs = { + # "base_path": hf_hub_url(self.name, "", revision=self.revision).rstrip("/"), + 'base_path': _api.get_file_base_path(namespace=_namespace, dataset_name=_dataset_name), + 'repo_id': self.name, + } + return DatasetModule(module_path, hash, builder_kwargs) + + +class DatasetsWrapperHF: + + @staticmethod + def load_dataset( + path: str, + name: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None, + split: Optional[Union[str, Split]] = None, + cache_dir: Optional[str] = None, + features: Optional[Features] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + verification_mode: Optional[Union[VerificationMode, str]] = None, + keep_in_memory: Optional[bool] = None, + save_infos: bool = False, + revision: Optional[Union[str, Version]] = None, + token: Optional[Union[bool, str]] = None, + use_auth_token='deprecated', + task='deprecated', + streaming: bool = False, + num_proc: Optional[int] = None, + storage_options: Optional[Dict] = None, + trust_remote_code: bool = True, + dataset_info_only: Optional[bool] = False, + **config_kwargs, + ) -> Union[DatasetDict, Dataset, IterableDatasetDict, IterableDataset, + dict]: + + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + "You can remove this warning by passing 'token=' instead.", + FutureWarning, + ) + token = use_auth_token + if task != 'deprecated': + warnings.warn( + "'task' was deprecated in version 2.13.0 and will be removed in 3.0.0.\n", + FutureWarning, + ) + else: + task = None + if data_files is not None and not data_files: + raise ValueError( + f"Empty 'data_files': '{data_files}'. It should be either non-empty or None (default)." + ) + if Path(path, config.DATASET_STATE_JSON_FILENAME).exists( + ): + raise ValueError( + 'You are trying to load a dataset that was saved using `save_to_disk`. 
' + 'Please use `load_from_disk` instead.') + + if streaming and num_proc is not None: + raise NotImplementedError( + 'Loading a streaming dataset in parallel with `num_proc` is not implemented. ' + 'To parallelize streaming, you can wrap the dataset with a PyTorch DataLoader ' + 'using `num_workers` > 1 instead.') + + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + verification_mode = VerificationMode(( + verification_mode or VerificationMode.BASIC_CHECKS + ) if not save_infos else VerificationMode.ALL_CHECKS) + + # Create a dataset builder + builder_instance = DatasetsWrapperHF.load_dataset_builder( + path=path, + name=name, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_dir, + features=features, + download_config=download_config, + download_mode=download_mode, + revision=revision, + token=token, + storage_options=storage_options, + trust_remote_code=trust_remote_code, + _require_default_config_name=name is None, + **config_kwargs, + ) + + # Note: Only for preview mode + if dataset_info_only: + ret_dict = {} + # Get dataset config info from python script + if isinstance(path, str) and path.endswith('.py') and os.path.exists(path): + from datasets import get_dataset_config_names + subset_list = get_dataset_config_names(path) + ret_dict = {_subset: [] for _subset in subset_list} + return ret_dict + + if builder_instance is None or not hasattr(builder_instance, + 'builder_configs'): + logger.error(f'No builder_configs found for {path} dataset.') + return ret_dict + + _tmp_builder_configs = builder_instance.builder_configs + for tmp_config_name, tmp_builder_config in _tmp_builder_configs.items(): + tmp_config_name = str(tmp_config_name) + if hasattr(tmp_builder_config, 'data_files') and tmp_builder_config.data_files is not None: + ret_dict[tmp_config_name] = [str(item) for item in list(tmp_builder_config.data_files.keys())] + else: + ret_dict[tmp_config_name] = [] + return ret_dict + + # Return iterable dataset in case of streaming + if streaming: + return builder_instance.as_streaming_dataset(split=split) + + # Some datasets are already processed on the HF google storage + # Don't try downloading from Google storage for the packaged datasets as text, json, csv or pandas + # try_from_hf_gcs = path not in _PACKAGED_DATASETS_MODULES + + # Download and prepare data + builder_instance.download_and_prepare( + download_config=download_config, + download_mode=download_mode, + verification_mode=verification_mode, + num_proc=num_proc, + storage_options=storage_options, + # base_path=builder_instance.base_path, + # file_format=builder_instance.name or 'arrow', + ) + + # Build dataset for splits + keep_in_memory = ( + keep_in_memory if keep_in_memory is not None else is_small_dataset( + builder_instance.info.dataset_size)) + ds = builder_instance.as_dataset( + split=split, + verification_mode=verification_mode, + in_memory=keep_in_memory) + # Rename and cast features to match task schema + if task is not None: + # To avoid issuing the same warning twice + with warnings.catch_warnings(): + warnings.simplefilter('ignore', FutureWarning) + ds = ds.prepare_for_task(task) + if save_infos: + builder_instance._save_infos() + + try: + _api = HubApi() + if is_relative_path(path) and path.count('/') == 1: + _namespace, _dataset_name = path.split('/') + _api.dataset_download_statistics(dataset_name=_dataset_name, namespace=_namespace) + except Exception as e: + logger.warning(f'Could not record download statistics: {e}') + + return ds + + @staticmethod + 
def load_dataset_builder( + path: str, + name: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[str, Sequence[str], + Mapping[str, Union[str, + Sequence[str]]]]] = None, + cache_dir: Optional[str] = None, + features: Optional[Features] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + revision: Optional[Union[str, Version]] = None, + token: Optional[Union[bool, str]] = None, + use_auth_token='deprecated', + storage_options: Optional[Dict] = None, + trust_remote_code: Optional[bool] = None, + _require_default_config_name=True, + **config_kwargs, + ) -> DatasetBuilder: + + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + "You can remove this warning by passing 'token=' instead.", + FutureWarning, + ) + token = use_auth_token + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + if token is not None: + download_config = download_config.copy( + ) if download_config else DownloadConfig() + download_config.token = token + if storage_options is not None: + download_config = download_config.copy( + ) if download_config else DownloadConfig() + download_config.storage_options.update(storage_options) + + dataset_module = DatasetsWrapperHF.dataset_module_factory( + path, + revision=revision, + download_config=download_config, + download_mode=download_mode, + data_dir=data_dir, + data_files=data_files, + cache_dir=cache_dir, + trust_remote_code=trust_remote_code, + _require_default_config_name=_require_default_config_name, + _require_custom_configs=bool(config_kwargs), + name=name, + ) + # Get dataset builder class from the processing script + builder_kwargs = dataset_module.builder_kwargs + data_dir = builder_kwargs.pop('data_dir', data_dir) + data_files = builder_kwargs.pop('data_files', data_files) + config_name = builder_kwargs.pop( + 'config_name', name + or dataset_module.builder_configs_parameters.default_config_name) + dataset_name = builder_kwargs.pop('dataset_name', None) + info = dataset_module.dataset_infos.get( + config_name) if dataset_module.dataset_infos else None + + if (path in _PACKAGED_DATASETS_MODULES and data_files is None + and dataset_module.builder_configs_parameters. + builder_configs[0].data_files is None): + error_msg = f'Please specify the data files or data directory to load for the {path} dataset builder.' 
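# Hedged usage sketch: packaged builders such as 'csv' or 'json' have no repo
# to inspect, so data files must be given explicitly (the path is hypothetical);
# this is exactly the situation the error message constructed above covers.
builder = DatasetsWrapperHF.load_dataset_builder(
    'csv',
    data_files={'train': 'path/to/train.csv'},
)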
+ example_extensions = [ + extension for extension in _EXTENSION_TO_MODULE + if _EXTENSION_TO_MODULE[extension] == path + ] + if example_extensions: + error_msg += f'\nFor example `data_files={{"train": "path/to/data/train/*.{example_extensions[0]}"}}`' + raise ValueError(error_msg) + + builder_cls = get_dataset_builder_class( + dataset_module, dataset_name=dataset_name) + + builder_instance: DatasetBuilder = builder_cls( + cache_dir=cache_dir, + dataset_name=dataset_name, + config_name=config_name, + data_dir=data_dir, + data_files=data_files, + hash=dataset_module.hash, + info=info, + features=features, + token=token, + storage_options=storage_options, + **builder_kwargs, # contains base_path + **config_kwargs, + ) + builder_instance._use_legacy_cache_dir_if_possible(dataset_module) + + return builder_instance + + @staticmethod + def dataset_module_factory( + path: str, + revision: Optional[Union[str, Version]] = None, + download_config: Optional[DownloadConfig] = None, + download_mode: Optional[Union[DownloadMode, str]] = None, + dynamic_modules_path: Optional[str] = None, + data_dir: Optional[str] = None, + data_files: Optional[Union[Dict, List, str, DataFilesDict]] = None, + cache_dir: Optional[str] = None, + trust_remote_code: Optional[bool] = None, + _require_default_config_name=True, + _require_custom_configs=False, + **download_kwargs, + ) -> DatasetModule: + + subset_name: str = download_kwargs.pop('name', None) + if download_config is None: + download_config = DownloadConfig(**download_kwargs) + download_config.storage_options.update({'name': subset_name}) + + if download_config and download_config.cache_dir is None: + download_config.cache_dir = MS_DATASETS_CACHE + + download_mode = DownloadMode(download_mode + or DownloadMode.REUSE_DATASET_IF_EXISTS) + download_config.extract_compressed_file = True + download_config.force_extract = True + download_config.force_download = download_mode == DownloadMode.FORCE_REDOWNLOAD + + filename = list( + filter(lambda x: x, + path.replace(os.sep, '/').split('/')))[-1] + if not filename.endswith('.py'): + filename = filename + '.py' + combined_path = os.path.join(path, filename) + + # We have several ways to get a dataset builder: + # + # - if path is the name of a packaged dataset module + # -> use the packaged module (json, csv, etc.) + # + # - if os.path.join(path, name) is a local python file + # -> use the module from the python file + # - if path is a local directory (but no python file) + # -> use a packaged module (csv, text etc.) based on content of the directory + # + # - if path has one "/" and is dataset repository on the HF hub with a python file + # -> the module from the python file in the dataset repository + # - if path has one "/" and is dataset repository on the HF hub without a python file + # -> use a packaged module (csv, text etc.) 
based on content of the repository + + # Try packaged + if path in _PACKAGED_DATASETS_MODULES: + return PackagedDatasetModuleFactory( + path, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + ).get_module() + # Try locally + elif path.endswith(filename): + if os.path.isfile(path): + return LocalDatasetModuleFactoryWithScript( + path, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(path)}" + ) + elif os.path.isfile(combined_path): + return LocalDatasetModuleFactoryWithScript( + combined_path, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + elif os.path.isdir(path): + return LocalDatasetModuleFactoryWithoutScript( + path, + data_dir=data_dir, + data_files=data_files, + download_mode=download_mode).get_module() + # Try remotely + elif is_relative_path(path) and path.count('/') <= 1: + try: + _raise_if_offline_mode_is_enabled() + + try: + dataset_info = HfApi().dataset_info( + repo_id=path, + revision=revision, + token=download_config.token, + timeout=100.0, + ) + except Exception as e: # noqa catch any exception of hf_hub and consider that the dataset doesn't exist + if isinstance( + e, + ( # noqa: E131 + OfflineModeIsEnabled, # noqa: E131 + requests.exceptions. + ConnectTimeout, # noqa: E131, E261 + requests.exceptions.ConnectionError, # noqa: E131 + ), # noqa: E131 + ): + raise ConnectionError( + f"Couldn't reach '{path}' on the Hub ({type(e).__name__})" + ) + elif '404' in str(e): + msg = f"Dataset '{path}' doesn't exist on the Hub" + raise DatasetNotFoundError( + msg + + f" at revision '{revision}'" if revision else msg + ) + elif '401' in str(e): + msg = f"Dataset '{path}' doesn't exist on the Hub" + msg = msg + f" at revision '{revision}'" if revision else msg + raise DatasetNotFoundError( + msg + '. If the repo is private or gated, ' + 'make sure to log in with `huggingface-cli login`.' + ) + else: + raise e + if filename in [ + sibling.rfilename for sibling in dataset_info.siblings + ]: # contains a dataset script + + # fs = HfFileSystem( + # endpoint=config.HF_ENDPOINT, + # token=download_config.token) + + # TODO + can_load_config_from_parquet_export = False + # if _require_custom_configs: + # can_load_config_from_parquet_export = False + # elif _require_default_config_name: + # with fs.open( + # f'datasets/{path}/{filename}', + # 'r', + # revision=revision, + # encoding='utf-8') as f: + # can_load_config_from_parquet_export = 'DEFAULT_CONFIG_NAME' not in f.read( + # ) + # else: + # can_load_config_from_parquet_export = True + if config.USE_PARQUET_EXPORT and can_load_config_from_parquet_export: + # If the parquet export is ready (parquet files + info available for the current sha), + # we can use it instead + # This fails when the dataset has multiple configs and a default config and + # the user didn't specify a configuration name (_require_default_config_name=True). 
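# Hedged illustration of the resolution order described in the comment block
# above (paths and repo ids are hypothetical):
module_packaged = DatasetsWrapperHF.dataset_module_factory('csv')                      # packaged module
module_local = DatasetsWrapperHF.dataset_module_factory('./my_data')                   # local dir, packaged module inferred
module_hub = DatasetsWrapperHF.dataset_module_factory('some_namespace/some_dataset')   # hub repo, with or without a script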
+ try: + return HubDatasetModuleFactoryWithParquetExport( + path, + download_config=download_config, + revision=dataset_info.sha).get_module() + except Exception as e: + logger.error(e) + + # Otherwise we must use the dataset script if the user trusts it + return HubDatasetModuleFactoryWithScript( + path, + revision=revision, + download_config=download_config, + download_mode=download_mode, + dynamic_modules_path=dynamic_modules_path, + trust_remote_code=trust_remote_code, + ).get_module() + else: + return HubDatasetModuleFactoryWithoutScript( + path, + revision=revision, + data_dir=data_dir, + data_files=data_files, + download_config=download_config, + download_mode=download_mode, + ).get_module() + except Exception as e1: + # All the attempts failed, before raising the error we should check if the module is already cached + logger.error(f'>> Error loading {path}: {e1}') + try: + return CachedDatasetModuleFactory( + path, + dynamic_modules_path=dynamic_modules_path, + cache_dir=cache_dir).get_module() + except Exception: + # If it's not in the cache, then it doesn't exist. + if isinstance(e1, OfflineModeIsEnabled): + raise ConnectionError( + f"Couldn't reach the Hugging Face Hub for dataset '{path}': {e1}" + ) from None + if isinstance(e1, + (DataFilesNotFoundError, + DatasetNotFoundError, EmptyDatasetError)): + raise e1 from None + if isinstance(e1, FileNotFoundError): + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or " + f'any data file in the same directory. ' + f"Couldn't find '{path}' on the Hugging Face Hub either: {type(e1).__name__}: {e1}" + ) from None + raise e1 from None + else: + raise FileNotFoundError( + f"Couldn't find a dataset script at {relative_to_absolute_path(combined_path)} or " + f'any data file in the same directory.') + + +@contextlib.contextmanager +def load_dataset_with_ctx(*args, **kwargs): + + # Keep the original functions + hf_endpoint_origin = config.HF_ENDPOINT + get_from_cache_origin = file_utils.get_from_cache + + # Compatible with datasets 2.18.0 + _download_origin = DownloadManager._download if hasattr(DownloadManager, '_download') \ + else DownloadManager._download_single + + dataset_info_origin = HfApi.dataset_info + list_repo_tree_origin = HfApi.list_repo_tree + get_paths_info_origin = HfApi.get_paths_info + resolve_pattern_origin = data_files.resolve_pattern + get_module_without_script_origin = HubDatasetModuleFactoryWithoutScript.get_module + get_module_with_script_origin = HubDatasetModuleFactoryWithScript.get_module + + # Monkey patching with modelscope functions + config.HF_ENDPOINT = get_endpoint() + file_utils.get_from_cache = get_from_cache_ms + # Compatible with datasets 2.18.0 + if hasattr(DownloadManager, '_download'): + DownloadManager._download = _download_ms + else: + DownloadManager._download_single = _download_ms + HfApi.dataset_info = _dataset_info + HfApi.list_repo_tree = _list_repo_tree + HfApi.get_paths_info = _get_paths_info + data_files.resolve_pattern = _resolve_pattern + HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script + HubDatasetModuleFactoryWithScript.get_module = get_module_with_script + + streaming = kwargs.get('streaming', False) + + try: + dataset_res = DatasetsWrapperHF.load_dataset(*args, **kwargs) + yield dataset_res + finally: + # Restore the original functions + config.HF_ENDPOINT = hf_endpoint_origin + file_utils.get_from_cache = get_from_cache_origin + # Keep the context during the streaming iteration + if not streaming: + 
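# A minimal, self-contained sketch (hypothetical helper) of the patch/restore
# pattern that load_dataset_with_ctx applies above: remember the original
# attribute, install the replacement, and always put the original back on exit.
@contextlib.contextmanager
def _patched(obj, attr, replacement):
    original = getattr(obj, attr)
    setattr(obj, attr, replacement)
    try:
        yield
    finally:
        setattr(obj, attr, original)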
config.HF_ENDPOINT = hf_endpoint_origin + file_utils.get_from_cache = get_from_cache_origin + + # Compatible with datasets 2.18.0 + if hasattr(DownloadManager, '_download'): + DownloadManager._download = _download_origin + else: + DownloadManager._download_single = _download_origin + + HfApi.dataset_info = dataset_info_origin + HfApi.list_repo_tree = list_repo_tree_origin + HfApi.get_paths_info = get_paths_info_origin + data_files.resolve_pattern = resolve_pattern_origin + HubDatasetModuleFactoryWithoutScript.get_module = get_module_without_script_origin + HubDatasetModuleFactoryWithScript.get_module = get_module_with_script_origin diff --git a/modelscope/msdatasets/utils/hf_file_utils.py b/modelscope/msdatasets/utils/hf_file_utils.py new file mode 100644 index 000000000..863bb1960 --- /dev/null +++ b/modelscope/msdatasets/utils/hf_file_utils.py @@ -0,0 +1,346 @@ +# noqa: isort:skip_file, yapf: disable +# Copyright (c) Alibaba, Inc. and its affiliates. +# Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. + +import json +import os +import re +import copy +import shutil +import time +import warnings +from contextlib import contextmanager +from functools import partial +from pathlib import Path +from typing import Optional, Union +from urllib.parse import urljoin, urlparse +import requests +from tqdm import tqdm + +from datasets import config +from datasets.utils.file_utils import hash_url_to_filename, \ + get_authentication_headers_for_url, fsspec_head, fsspec_get +from filelock import FileLock + +from modelscope.utils.config_ds import MS_DATASETS_CACHE +from modelscope.utils.logger import get_logger +from modelscope.hub.api import ModelScopeConfig + +from modelscope import __version__ + +logger = get_logger() + + +def get_datasets_user_agent_ms(user_agent: Optional[Union[str, dict]] = None) -> str: + ua = f'datasets/{__version__}' + ua += f'; python/{config.PY_VERSION}' + ua += f'; pyarrow/{config.PYARROW_VERSION}' + if config.TORCH_AVAILABLE: + ua += f'; torch/{config.TORCH_VERSION}' + if config.TF_AVAILABLE: + ua += f'; tensorflow/{config.TF_VERSION}' + if config.JAX_AVAILABLE: + ua += f'; jax/{config.JAX_VERSION}' + if isinstance(user_agent, dict): + ua += f"; {'; '.join(f'{k}/{v}' for k, v in user_agent.items())}" + elif isinstance(user_agent, str): + ua += '; ' + user_agent + return ua + + +def _request_with_retry_ms( + method: str, + url: str, + max_retries: int = 2, + base_wait_time: float = 0.5, + max_wait_time: float = 2, + timeout: float = 10.0, + **params, +) -> requests.Response: + """Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff. + + Note that if the environment variable HF_DATASETS_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised. + + Args: + method (str): HTTP method, such as 'GET' or 'HEAD'. + url (str): The URL of the resource to fetch. + max_retries (int): Maximum number of retries, defaults to 0 (no retries). + base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between + retries then grows exponentially, capped by max_wait_time. + max_wait_time (float): Maximum amount of time between two retries, in seconds. + **params (additional keyword arguments): Params to pass to :obj:`requests.request`. 
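    A hedged sketch of the resulting backoff schedule (pure arithmetic, capped
    at max_wait_time):

        >>> base_wait_time, max_wait_time = 0.5, 2.0
        >>> [min(max_wait_time, base_wait_time * 2 ** (t - 1)) for t in range(1, 5)]
        [0.5, 1.0, 2.0, 2.0]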
+ """ + tries, success = 0, False + response = None + while not success: + tries += 1 + try: + response = requests.request(method=method.upper(), url=url, timeout=timeout, **params) + success = True + except (requests.exceptions.ConnectTimeout, requests.exceptions.ConnectionError) as err: + if tries > max_retries: + raise err + else: + logger.info(f'{method} request to {url} timed out, retrying... [{tries/max_retries}]') + sleep_time = min(max_wait_time, base_wait_time * 2 ** (tries - 1)) # Exponential backoff + time.sleep(sleep_time) + return response + + +def http_head_ms( + url, proxies=None, headers=None, cookies=None, allow_redirects=True, timeout=10.0, max_retries=0 +) -> requests.Response: + headers = copy.deepcopy(headers) or {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + response = _request_with_retry_ms( + method='HEAD', + url=url, + proxies=proxies, + headers=headers, + cookies=cookies, + allow_redirects=allow_redirects, + timeout=timeout, + max_retries=max_retries, + ) + return response + + +def http_get_ms( + url, temp_file, proxies=None, resume_size=0, headers=None, cookies=None, timeout=100.0, max_retries=0, desc=None +) -> Optional[requests.Response]: + headers = dict(headers) if headers is not None else {} + headers['user-agent'] = get_datasets_user_agent_ms(user_agent=headers.get('user-agent')) + if resume_size > 0: + headers['Range'] = f'bytes={resume_size:d}-' + response = _request_with_retry_ms( + method='GET', + url=url, + stream=True, + proxies=proxies, + headers=headers, + cookies=cookies, + max_retries=max_retries, + timeout=timeout, + ) + if temp_file is None: + return response + if response.status_code == 416: # Range not satisfiable + return + content_length = response.headers.get('Content-Length') + total = resume_size + int(content_length) if content_length is not None else None + + progress = tqdm(total=total, initial=resume_size, unit_scale=True, unit='B', desc=desc or 'Downloading') + for chunk in response.iter_content(chunk_size=1024): + progress.update(len(chunk)) + temp_file.write(chunk) + + progress.close() + + +def get_from_cache_ms( + url, + cache_dir=None, + force_download=False, + proxies=None, + etag_timeout=100, + resume_download=False, + user_agent=None, + local_files_only=False, + use_etag=True, + max_retries=0, + token=None, + use_auth_token='deprecated', + ignore_url_params=False, + storage_options=None, + download_desc=None, + disable_tqdm=None, +) -> str: + """ + Given a URL, look for the corresponding file in the local cache. + If it's not there, download it. Then return the path to the cached file. 
+ + Return: + Local path (string) + + Raises: + FileNotFoundError: in case of non-recoverable file + (non-existent or no cache on disk) + ConnectionError: in case of unreachable url + and no cache on disk + """ + if use_auth_token != 'deprecated': + warnings.warn( + "'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.\n" + f"You can remove this warning by passing 'token={use_auth_token}' instead.", + FutureWarning, + ) + token = use_auth_token + if cache_dir is None: + cache_dir = MS_DATASETS_CACHE + if isinstance(cache_dir, Path): + cache_dir = str(cache_dir) + + os.makedirs(cache_dir, exist_ok=True) + + if ignore_url_params: + # strip all query parameters and #fragments from the URL + cached_url = urljoin(url, urlparse(url).path) + else: + cached_url = url # additional parameters may be added to the given URL + + connected = False + response = None + cookies = None + etag = None + head_error = None + scheme = None + + # Try a first time to file the file on the local file system without eTag (None) + # if we don't ask for 'force_download' then we spare a request + filename = hash_url_to_filename(cached_url, etag=None) + cache_path = os.path.join(cache_dir, filename) + if download_desc is None: + download_desc = 'Downloading [' + filename + ']' + + if os.path.exists(cache_path) and not force_download and not use_etag: + return cache_path + + # Prepare headers for authentication + headers = get_authentication_headers_for_url(url, token=token) + if user_agent is not None: + headers['user-agent'] = user_agent + + # We don't have the file locally or we need an eTag + if not local_files_only: + scheme = urlparse(url).scheme + if scheme not in ('http', 'https'): + response = fsspec_head(url, storage_options=storage_options) + # s3fs uses "ETag", gcsfs uses "etag" + etag = (response.get('ETag', None) or response.get('etag', None)) if use_etag else None + connected = True + try: + cookies = ModelScopeConfig.get_cookies() + response = http_head_ms( + url, + allow_redirects=True, + proxies=proxies, + timeout=etag_timeout, + max_retries=max_retries, + headers=headers, + cookies=cookies, + ) + if response.status_code == 200: # ok + etag = response.headers.get('ETag') if use_etag else None + for k, v in response.cookies.items(): + # In some edge cases, we need to get a confirmation token + if k.startswith('download_warning') and 'drive.google.com' in url: + url += '&confirm=' + v + cookies = response.cookies + connected = True + # Fix Google Drive URL to avoid Virus scan warning + if 'drive.google.com' in url and 'confirm=' not in url: + url += '&confirm=t' + # In some edge cases, head request returns 400 but the connection is actually ok + elif ( + (response.status_code == 400 and 'firebasestorage.googleapis.com' in url) + or (response.status_code == 405 and 'drive.google.com' in url) + or ( + response.status_code == 403 + and ( + re.match(r'^https?://github.com/.*?/.*?/releases/download/.*?/.*?$', url) + or re.match(r'^https://.*?s3.*?amazonaws.com/.*?$', response.url) + ) + ) + or (response.status_code == 403 and 'ndownloader.figstatic.com' in url) + ): + connected = True + logger.info(f"Couldn't get ETag version for url {url}") + elif response.status_code == 401 and config.HF_ENDPOINT in url and token is None: + raise ConnectionError( + f'Unauthorized for URL {url}. 
' + f'Please use the parameter `token=True` after logging in with `huggingface-cli login`' + ) + except (OSError, requests.exceptions.Timeout) as e: + # not connected + head_error = e + pass + + # connected == False = we don't have a connection, or url doesn't exist, or is otherwise inaccessible. + # try to get the last downloaded one + if not connected: + if os.path.exists(cache_path) and not force_download: + return cache_path + if local_files_only: + raise FileNotFoundError( + f'Cannot find the requested files in the cached path at {cache_path} and outgoing traffic has been' + " disabled. To enable file online look-ups, set 'local_files_only' to False." + ) + elif response is not None and response.status_code == 404: + raise FileNotFoundError(f"Couldn't find file at {url}") + if head_error is not None: + raise ConnectionError(f"Couldn't reach {url} ({repr(head_error)})") + elif response is not None: + raise ConnectionError(f"Couldn't reach {url} (error {response.status_code})") + else: + raise ConnectionError(f"Couldn't reach {url}") + + # Try a second time + filename = hash_url_to_filename(cached_url, etag) + cache_path = os.path.join(cache_dir, filename) + + if os.path.exists(cache_path) and not force_download: + return cache_path + + # From now on, connected is True. + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + '.lock' + with FileLock(lock_path): + # Retry in case previously locked processes just enter after the precedent process releases the lock + if os.path.exists(cache_path) and not force_download: + return cache_path + + incomplete_path = cache_path + '.incomplete' + + @contextmanager + def temp_file_manager(mode='w+b'): + with open(incomplete_path, mode) as f: + yield f + + resume_size = 0 + if resume_download: + temp_file_manager = partial(temp_file_manager, mode='a+b') + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size + + # Download to temporary file, then copy to cache path once finished. + # Otherwise, you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + + # GET file object + if scheme not in ('http', 'https'): + fsspec_get(url, temp_file, storage_options=storage_options, desc=download_desc) + else: + http_get_ms( + url, + temp_file=temp_file, + proxies=proxies, + resume_size=resume_size, + headers=headers, + cookies=cookies, + max_retries=max_retries, + desc=download_desc, + ) + + logger.info(f'storing {url} in cache at {cache_path}') + shutil.move(temp_file.name, cache_path) + umask = os.umask(0o666) + os.umask(umask) + os.chmod(cache_path, 0o666 & ~umask) + + logger.info(f'creating metadata file for {cache_path}') + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w', encoding='utf-8') as meta_file: + json.dump(meta, meta_file) + + return cache_path diff --git a/modelscope/outputs/nlp_outputs.py b/modelscope/outputs/nlp_outputs.py index ed42cb5a8..747f5bd3d 100644 --- a/modelscope/outputs/nlp_outputs.py +++ b/modelscope/outputs/nlp_outputs.py @@ -326,7 +326,7 @@ class TextErrorCorrectionOutput(ModelOutputBase): """The output class for information extraction models. 
""" - predictions: np.ndarray = None + predictions: List = None @dataclass diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py index a32fc157d..4db9c0bac 100644 --- a/modelscope/outputs/outputs.py +++ b/modelscope/outputs/outputs.py @@ -25,6 +25,10 @@ class OutputKeys(object): MASKS = 'masks' DEPTHS = 'depths' DEPTHS_COLOR = 'depths_color' + FLOWS = 'flows' + FLOWS_COLOR = 'flows_color' + NORMALS = 'normals' + NORMALS_COLOR = 'normals_color' LAYOUT = 'layout' TEXT = 'text' POLYGONS = 'polygons' @@ -69,6 +73,7 @@ class OutputKeys(object): PCD12 = 'pcd12' PCD12_ALIGN = 'pcd12_align' TBOUNDS = 'tbounds' + MV_IMGS = 'MViews' OutputTypes = { @@ -132,6 +137,7 @@ class OutputKeys(object): OutputKeys.PCD12: np.ndarray, OutputKeys.PCD12_ALIGN: np.ndarray, OutputKeys.TBOUNDS: Dict, + OutputKeys.MV_IMGS: List[np.ndarray], } OutputTypeSchema = { @@ -426,6 +432,15 @@ class OutputKeys(object): OutputKeys.TBOUNDS: { 'type': 'object' }, + OutputKeys.MV_IMGS: { + 'type': 'array', + 'items': { + 'type': 'array', + 'items': { + 'type': 'number' + } + } + }, } TASK_OUTPUTS = { @@ -761,6 +776,7 @@ class OutputKeys(object): Tasks.surface_recon_common: [OutputKeys.OUTPUT], Tasks.video_colorization: [OutputKeys.OUTPUT_VIDEO], Tasks.image_control_3d_portrait: [OutputKeys.OUTPUT], + Tasks.self_supervised_depth_completion: [OutputKeys.OUTPUT_IMG], # image quality assessment degradation result for single image # { @@ -1632,6 +1648,8 @@ class OutputKeys(object): # "output_imgs": np.ndarray list with shape [[height, width, 3], ...] # } Tasks.image_view_transform: [OutputKeys.OUTPUT_IMGS], + Tasks.image_to_3d: [OutputKeys.MV_IMGS], + Tasks.siamese_uie: [OutputKeys.OUTPUT], } diff --git a/modelscope/pipeline_inputs.py b/modelscope/pipeline_inputs.py index 6e6443765..f281d0e70 100644 --- a/modelscope/pipeline_inputs.py +++ b/modelscope/pipeline_inputs.py @@ -247,8 +247,10 @@ def check_input_type(input_type, input): InputType.VIDEO, # image generation task result for a single image - Tasks.image_to_image_generation: - InputType.IMAGE, + Tasks.image_to_image_generation: [ + InputType.IMAGE, + (InputType.IMAGE, InputType.IMAGE, InputType.IMAGE, InputType.IMAGE) + ], Tasks.image_to_image_translation: InputType.IMAGE, Tasks.image_style_transfer: { @@ -436,6 +438,8 @@ def check_input_type(input_type, input): Tasks.machine_reading_comprehension: InputType.TEXT, + Tasks.siamese_uie: InputType.TEXT, + # ============ audio tasks =================== Tasks.auto_speech_recognition: # input can be audio, or audio and text. [InputType.AUDIO, { diff --git a/modelscope/pipelines/accelerate/vllm.py b/modelscope/pipelines/accelerate/vllm.py index 5c11c29b0..15ced4bb6 100644 --- a/modelscope/pipelines/accelerate/vllm.py +++ b/modelscope/pipelines/accelerate/vllm.py @@ -42,6 +42,24 @@ def __call__(self, prompts: Union[List[str], List[List[int]]], The string batch or the token list batch to input to the model. kwargs: Sampling parameters. """ + + # convert hf generate config to vllm + do_sample = kwargs.pop('do_sample', None) + num_beam = kwargs.pop('num_beam', 1) + max_length = kwargs.pop('max_length', None) + max_new_tokens = kwargs.pop('max_new_tokens', None) + + # for vllm, default to do_sample/greedy(depends on temperature). 
+ # for hf, do_sample=false, num_beam=1 -> greedy(default) + # do_sample=ture, num_beam=1 -> sample + # do_sample=false, num_beam>1 -> beam_search + if not do_sample and num_beam > 1: + kwargs['use_beam_search'] = True + if max_length: + kwargs['max_tokens'] = max_length - len(prompts[0]) + if max_new_tokens: + kwargs['max_tokens'] = max_new_tokens + from vllm import SamplingParams sampling_params = SamplingParams(**kwargs) if isinstance(prompts[0], str): diff --git a/modelscope/pipelines/audio/ans_pipeline.py b/modelscope/pipelines/audio/ans_pipeline.py index 3719689c9..60a63722d 100644 --- a/modelscope/pipelines/audio/ans_pipeline.py +++ b/modelscope/pipelines/audio/ans_pipeline.py @@ -122,3 +122,126 @@ def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16), self.SAMPLE_RATE) return inputs + + +@PIPELINES.register_module( + Tasks.acoustic_noise_suppression, + module_name=Pipelines.speech_zipenhancer_ans_multiloss_16k_base) +class ANSZipEnhancerPipeline(Pipeline): + r"""ANS (Acoustic Noise Suppression) Inference Pipeline . + + When invoke the class with pipeline.__call__(), it accept only one parameter: + inputs(str): the path of wav file + """ + SAMPLE_RATE = 16000 + + def __init__(self, model, **kwargs): + """ + use `model` and `preprocessor` to create a kws pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + self.model.eval() + self.stream_mode = kwargs.get('stream_mode', False) + + def preprocess(self, inputs: Input, **preprocess_params) -> Dict[str, Any]: + if self.stream_mode: + raise TypeError('This model does not support stream mode!') + if isinstance(inputs, bytes): + data1, fs = sf.read(io.BytesIO(inputs)) + elif isinstance(inputs, str): + file_bytes = File.read(inputs) + data1, fs = sf.read(io.BytesIO(file_bytes)) + else: + raise TypeError(f'Unsupported type {type(inputs)}.') + if len(data1.shape) > 1: + data1 = data1[:, 0] + if fs != self.SAMPLE_RATE: + data1 = librosa.resample( + data1, orig_sr=fs, target_sr=self.SAMPLE_RATE) + data1 = audio_norm(data1) + data = data1.astype(np.float32) + inputs = np.reshape(data, [1, data.shape[0]]) + return {'ndarray': inputs, 'nsamples': data.shape[0]} + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + ndarray = inputs['ndarray'] + if isinstance(ndarray, torch.Tensor): + ndarray = ndarray.cpu().numpy() + nsamples = inputs['nsamples'] + decode_do_segement = False + window = 16000 * 2 # 2s + stride = int(window * 0.75) + print('inputs:{}'.format(ndarray.shape)) + b, t = ndarray.shape # size() + if t > window * 3: # 6s + decode_do_segement = True + print('decode_do_segement') + + if t < window: + ndarray = np.concatenate( + [ndarray, np.zeros((ndarray.shape[0], window - t))], 1) + elif decode_do_segement: + if t < window + stride: + padding = window + stride - t + print('padding: {}'.format(padding)) + ndarray = np.concatenate( + [ndarray, np.zeros((ndarray.shape[0], padding))], 1) + else: + if (t - window) % stride != 0: + # padding = t - (t - window) // stride * stride + padding = ( + (t - window) // stride + 1) * stride + window - t + print('padding: {}'.format(padding)) + ndarray = np.concatenate( + [ndarray, + np.zeros((ndarray.shape[0], padding))], 1) + # else: + # if (t - window) % stride != 0: + # padding = t - (t - window) // stride * stride + # print('padding: {}'.format(padding)) + # ndarray = np.concatenate( + # [ndarray, np.zeros((ndarray.shape[0], 
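A condensed sketch of the HF-generate to vLLM kwargs translation added in vllm.py above. The returned dict is what would be handed to vllm.SamplingParams(**kwargs), assuming a vLLM version whose SamplingParams still accepts use_beam_search (as the change above does); prompt_len stands in for len(prompts[0]).

def hf_to_vllm_kwargs(prompt_len: int, **kwargs) -> dict:
    do_sample = kwargs.pop('do_sample', None)
    num_beam = kwargs.pop('num_beam', 1)
    max_length = kwargs.pop('max_length', None)
    max_new_tokens = kwargs.pop('max_new_tokens', None)
    if not do_sample and num_beam > 1:
        kwargs['use_beam_search'] = True  # HF beam search maps to vLLM beam search
    if max_length:
        kwargs['max_tokens'] = max_length - prompt_len  # total budget minus prompt length
    if max_new_tokens:
        kwargs['max_tokens'] = max_new_tokens  # an explicit new-token budget takes precedence
    return kwargs


# hf_to_vllm_kwargs(12, do_sample=False, num_beam=4, max_new_tokens=64)
# -> {'use_beam_search': True, 'max_tokens': 64}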
padding))], 1) + print('inputs after padding:{}'.format(ndarray.shape)) + with torch.no_grad(): + ndarray = torch.from_numpy(np.float32(ndarray)).to(self.device) + b, t = ndarray.shape + if decode_do_segement: + outputs = np.zeros(t) + give_up_length = (window - stride) // 2 + current_idx = 0 + while current_idx + window <= t: + # print('current_idx: {}'.format(current_idx)) + print( + '\rcurrent_idx: {} {:.2f}%'.format( + current_idx, current_idx * 100 / t), + end='') + tmp_input = dict(noisy=ndarray[:, current_idx:current_idx + + window]) + tmp_output = self.model( + tmp_input, )['wav_l2'][0].cpu().numpy() + end_index = current_idx + window - give_up_length + if current_idx == 0: + outputs[current_idx: + end_index] = tmp_output[:-give_up_length] + else: + outputs[current_idx + + give_up_length:end_index] = tmp_output[ + give_up_length:-give_up_length] + current_idx += stride + print('\rcurrent_idx: {} {:.2f}%'.format(current_idx, 100)) + else: + outputs = self.model( + dict(noisy=ndarray))['wav_l2'][0].cpu().numpy() + outputs = (outputs[:nsamples] * 32768).astype(np.int16).tobytes() + return {OutputKeys.OUTPUT_PCM: outputs} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + if 'output_path' in kwargs.keys(): + sf.write( + kwargs['output_path'], + np.frombuffer(inputs[OutputKeys.OUTPUT_PCM], dtype=np.int16), + self.SAMPLE_RATE) + return inputs diff --git a/modelscope/pipelines/audio/asr_inference_pipeline.py b/modelscope/pipelines/audio/asr_inference_pipeline.py deleted file mode 100644 index f825412c0..000000000 --- a/modelscope/pipelines/audio/asr_inference_pipeline.py +++ /dev/null @@ -1,591 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -import json -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.preprocessors import WavToScp -from modelscope.utils.audio.audio_utils import (extract_pcm_from_wav, - generate_scp_from_url, - load_bytes_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.hub import snapshot_download -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['AutomaticSpeechRecognitionPipeline'] - - -@PIPELINES.register_module( - Tasks.auto_speech_recognition, module_name=Pipelines.asr_inference) -class AutomaticSpeechRecognitionPipeline(Pipeline): - """ASR Inference Pipeline - Example: - - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> inference_pipeline = pipeline( - >>> task=Tasks.auto_speech_recognition, - >>> model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch') - - >>> rec_result = inference_pipeline( - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') - >>> print(rec_result) - - """ - - def __init__(self, - model: Union[Model, str] = None, - preprocessor: WavToScp = None, - vad_model: Optional[Union[Model, str]] = None, - vad_model_revision: Optional[str] = None, - punc_model: Optional[Union[Model, str]] = None, - punc_model_revision: Optional[str] = None, - lm_model: Optional[Union[Model, str]] = None, - lm_model_revision: Optional[str] = None, - timestamp_model: Optional[Union[Model, str]] = None, - 
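A minimal sketch of the overlap-and-discard stitching that ANSZipEnhancerPipeline.forward() above applies to long inputs. enhance_chunk is an identity stand-in for the model call, and zero-padding to a whole number of strides is omitted here (the pipeline pads as shown above and trims back to the original length afterwards).

import numpy as np


def enhance_chunk(chunk: np.ndarray) -> np.ndarray:
    return chunk  # stand-in for self.model(dict(noisy=chunk))['wav_l2']


def stitch_enhanced(signal: np.ndarray, window: int = 32000) -> np.ndarray:
    stride = int(window * 0.75)  # 75% hop, so consecutive chunks overlap by 25%
    give_up = (window - stride) // 2  # edge samples dropped from each chunk
    assert (len(signal) - window) % stride == 0, 'pad the signal first (see the pipeline)'
    out = np.zeros(len(signal))
    idx = 0
    while idx + window <= len(signal):
        chunk_out = enhance_chunk(signal[idx:idx + window])
        end = idx + window - give_up
        if idx == 0:
            out[idx:end] = chunk_out[:-give_up]  # keep the leading edge of the first chunk
        else:
            out[idx + give_up:end] = chunk_out[give_up:-give_up]  # drop both edges elsewhere
        idx += stride
    return out  # the trailing give_up samples stay zero, as in the pipeline before trimming

With a 2 s window at 16 kHz and a 75% stride, each interior chunk contributes only its central 1.5 s and 0.25 s on either side is discarded, which avoids boundary artifacts between chunks.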
timestamp_model_revision: Optional[str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` and `preprocessor` to create an asr pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - preprocessor: - (list of) Preprocessor object - vad_model (Optional: 'Model' or 'str'): - voice activity detection model from model hub or local - example: 'damo/speech_fsmn_vad_zh-cn-16k-common-pytorch' - punc_model (Optional: 'Model' or 'str'): - punctuation model from model hub or local - example: 'damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch' - lm_model (Optional: 'Model' or 'str'): - language model from model hub or local - example: 'damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch' - timestamp_model (Optional: 'Model' or 'str'): - timestamp model from model hub or local - example: 'damo/speech_timestamp_predictor-v1-16k-offline' - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - beam_size('int'): - beam size for decoding - ctc_weight('float'): - the CTC weight in joint decoding - lm_weight('float'): - lm weight - decoding_ind('int', defaults to 0): - decoding ind - decoding_mode('str', defaults to 'model1'): - decoding mode - vad_model_file('str'): - vad model file - vad_infer_config('str'): - VAD infer configuration - vad_cmvn_file('str'): - global CMVN file - punc_model_file('str'): - punc model file - punc_infer_config('str'): - punc infer config - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, preprocessor=preprocessor, **kwargs) - self.vad_model = vad_model - self.vad_model_revision = vad_model_revision - self.punc_model = punc_model - self.punc_model_revision = punc_model_revision - self.lm_model = lm_model - self.lm_model_revision = lm_model_revision - self.timestamp_model = timestamp_model - self.timestamp_model_revision = timestamp_model_revision - self.model_cfg = self.model.forward() - - self.cmd = self.get_cmd(kwargs, model) - from funasr.bin import asr_inference_launch - self.funasr_infer_modelscope = asr_inference_launch.inference_launch( - mode=self.cmd['mode'], - maxlenratio=self.cmd['maxlenratio'], - minlenratio=self.cmd['minlenratio'], - batch_size=self.cmd['batch_size'], - beam_size=self.cmd['beam_size'], - ngpu=ngpu, - ctc_weight=self.cmd['ctc_weight'], - lm_weight=self.cmd['lm_weight'], - penalty=self.cmd['penalty'], - log_level=self.cmd['log_level'], - asr_train_config=self.cmd['asr_train_config'], - asr_model_file=self.cmd['asr_model_file'], - cmvn_file=self.cmd['cmvn_file'], - lm_file=self.cmd['lm_file'], - token_type=self.cmd['token_type'], - key_file=self.cmd['key_file'], - lm_train_config=self.cmd['lm_train_config'], - bpemodel=self.cmd['bpemodel'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - output_dir=self.cmd['output_dir'], - dtype=self.cmd['dtype'], - seed=self.cmd['seed'], - ngram_weight=self.cmd['ngram_weight'], - nbest=self.cmd['nbest'], - num_workers=self.cmd['num_workers'], - vad_infer_config=self.cmd['vad_infer_config'], - vad_model_file=self.cmd['vad_model_file'], - vad_cmvn_file=self.cmd['vad_cmvn_file'], - punc_model_file=self.cmd['punc_model_file'], - punc_infer_config=self.cmd['punc_infer_config'], - timestamp_model_file=self.cmd['timestamp_model_file'], - timestamp_infer_config=self.cmd['timestamp_infer_config'], - 
timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'], - outputs_dict=self.cmd['outputs_dict'], - param_dict=self.cmd['param_dict'], - token_num_relax=self.cmd['token_num_relax'], - decoding_ind=self.cmd['decoding_ind'], - decoding_mode=self.cmd['decoding_mode'], - fake_streaming=self.cmd['fake_streaming'], - model_lang=self.cmd['model_lang'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - from funasr.utils import asr_utils - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type - audio_format('str'): - audio format - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The asr result. - """ - - # code base - # code_base = self.cmd['code_base'] - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - self.raw_inputs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - self.cmd['param_dict'] = param_dict - - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - if audio_in.startswith('http') or os.path.isfile(audio_in): - self.audio_in, self.raw_inputs = generate_scp_from_url( - audio_in) - else: - raise FileNotFoundError( - f'file {audio_in} NOT FOUND, please CHECK!') - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.preprocessor.forward(self.model_cfg, self.recog_type, - self.audio_format, self.audio_in, - self.audio_fs, self.cmd) - output = self.forward(output, **kwargs) - rst = self.postprocess(output) - return rst - - def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: - if self.preprocessor is None: - self.preprocessor = WavToScp() - - outputs = self.preprocessor.config_checking(self.model_cfg) - # generate asr inference command - cmd = { - 'maxlenratio': 0.0, - 'minlenratio': 0.0, - 'batch_size': 1, - 'beam_size': 1, - 'ngpu': 1, - 'ctc_weight': 0.0, - 'lm_weight': 0.0, - 'penalty': 0.0, - 'log_level': 'ERROR', - 'asr_train_config': None, - 'asr_model_file': outputs['am_model_path'], - 'cmvn_file': None, - 
'lm_train_config': None, - 'lm_file': None, - 'token_type': None, - 'key_file': None, - 'word_lm_train_config': None, - 'bpemodel': None, - 'allow_variable_data_keys': False, - 'output_dir': None, - 'dtype': 'float32', - 'seed': 0, - 'ngram_weight': 0.9, - 'nbest': 1, - 'num_workers': 0, - 'vad_infer_config': None, - 'vad_model_file': None, - 'vad_cmvn_file': None, - 'time_stamp_writer': True, - 'punc_infer_config': None, - 'punc_model_file': None, - 'timestamp_infer_config': None, - 'timestamp_model_file': None, - 'timestamp_cmvn_file': None, - 'outputs_dict': True, - 'param_dict': None, - 'model_type': outputs['model_type'], - 'idx_text': '', - 'sampled_ids': 'seq2seq/sampled_ids', - 'sampled_lengths': 'seq2seq/sampled_lengths', - 'model_lang': outputs['model_lang'], - 'code_base': outputs['code_base'], - 'mode': outputs['mode'], - 'fs': { - 'model_fs': None, - 'audio_fs': None - }, - 'fake_streaming': False, - } - - frontend_conf = None - token_num_relax = None - decoding_ind = None - decoding_mode = None - fake_streaming = False - if os.path.exists(outputs['am_model_config']): - config_file = open(outputs['am_model_config'], encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - if os.path.exists(outputs['asr_model_config']): - config_file = open(outputs['asr_model_config'], encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'token_num_relax' in root: - token_num_relax = root['token_num_relax'] - if 'decoding_ind' in root: - decoding_ind = root['decoding_ind'] - if 'decoding_mode' in root: - decoding_mode = root['decoding_mode'] - - cmd['beam_size'] = root['beam_size'] - cmd['penalty'] = root['penalty'] - cmd['maxlenratio'] = root['maxlenratio'] - cmd['minlenratio'] = root['minlenratio'] - cmd['ctc_weight'] = root['ctc_weight'] - cmd['lm_weight'] = root['lm_weight'] - cmd['asr_train_config'] = outputs['am_model_config'] - cmd['lm_file'] = outputs['lm_model_path'] - cmd['lm_train_config'] = outputs['lm_model_config'] - cmd['batch_size'] = outputs['model_config']['batch_size'] - cmd['frontend_conf'] = frontend_conf - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - cmd['token_num_relax'] = token_num_relax - cmd['decoding_ind'] = decoding_ind - cmd['decoding_mode'] = decoding_mode - cmd['fake_streaming'] = fake_streaming - if outputs.__contains__('mvn_file'): - cmd['cmvn_file'] = outputs['mvn_file'] - model_config = self.model_cfg['model_config'] - if model_config.__contains__('vad_model') and self.vad_model is None: - self.vad_model = model_config['vad_model'] - if model_config.__contains__('vad_model_revision'): - self.vad_model_revision = model_config['vad_model_revision'] - if model_config.__contains__('punc_model') and self.punc_model is None: - self.punc_model = model_config['punc_model'] - if model_config.__contains__('punc_model_revision'): - self.punc_model_revision = model_config['punc_model_revision'] - if model_config.__contains__( - 'timestamp_model') and self.timestamp_model is None: - self.timestamp_model = model_config['timestamp_model'] - if model_config.__contains__('timestamp_model_revision'): - self.timestamp_model_revision = model_config[ - 'timestamp_model_revision'] - update_local_model(model_config, model_path, extra_args) - self.load_vad_model(cmd) - self.load_punc_model(cmd) - self.load_lm_model(cmd) - self.load_timestamp_model(cmd) - - user_args_dict = [ - 'output_dir', - 'batch_size', 
- 'mode', - 'ngpu', - 'beam_size', - 'ctc_weight', - 'lm_weight', - 'decoding_ind', - 'decoding_mode', - 'vad_model_file', - 'vad_infer_config', - 'vad_cmvn_file', - 'punc_model_file', - 'punc_infer_config', - 'param_dict', - 'fake_streaming', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def load_vad_model(self, cmd): - if self.vad_model is not None and self.vad_model != '': - if os.path.exists(self.vad_model): - vad_model = self.vad_model - else: - vad_model = snapshot_download( - self.vad_model, revision=self.vad_model_revision) - logger.info('loading vad model from {0} ...'.format(vad_model)) - config_path = os.path.join(vad_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['vad_model_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['vad_model_name']) - cmd['vad_infer_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['vad_model_config']) - cmd['vad_cmvn_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_mvn_file']) - if 'vad' not in cmd['mode']: - cmd['mode'] = cmd['mode'] + '_vad' - - def load_punc_model(self, cmd): - if self.punc_model is not None and self.punc_model != '': - if os.path.exists(self.punc_model): - punc_model = self.punc_model - else: - punc_model = snapshot_download( - self.punc_model, revision=self.punc_model_revision) - logger.info( - 'loading punctuation model from {0} ...'.format(punc_model)) - config_path = os.path.join(punc_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['punc_model_file'] = os.path.join( - model_dir, model_cfg['model']['punc_model_name']) - cmd['punc_infer_config'] = os.path.join( - model_dir, - model_cfg['model']['punc_model_config']['punc_config']) - if 'punc' not in cmd['mode']: - cmd['mode'] = cmd['mode'] + '_punc' - - def load_lm_model(self, cmd): - if self.lm_model is not None and self.lm_model != '': - if os.path.exists(self.lm_model): - lm_model = self.lm_model - else: - lm_model = snapshot_download( - self.lm_model, revision=self.lm_model_revision) - logger.info('loading language model from {0} ...'.format(lm_model)) - config_path = os.path.join(lm_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['lm_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['lm_model_name']) - cmd['lm_train_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['lm_model_config']) - - # FIXME - def load_timestamp_model(self, cmd): - if self.timestamp_model is not None and self.timestamp_model != '': - if os.path.exists(self.timestamp_model): - timestamp_model = self.timestamp_model - else: - timestamp_model = snapshot_download( - self.timestamp_model, - revision=self.timestamp_model_revision) - logger.info( - 'loading timestamp model from {0} ...'.format(timestamp_model)) - config_path = os.path.join(timestamp_model, - ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['timestamp_model_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_model_file']) - cmd['timestamp_infer_config'] = os.path.join( - model_dir, - 
model_cfg['model']['model_config']['timestamp_infer_config']) - cmd['timestamp_cmvn_file'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_cmvn_file']) - - def forward(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: - """Decoding - """ - - logger.info(f"Decoding with {inputs['audio_format']} files ...") - - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [self.audio_in, 'speech', 'bytes'] - elif isinstance(self.audio_in, str): - data_cmd = [self.audio_in, 'speech', 'sound'] - elif self.raw_inputs is not None: - data_cmd = None - - # generate asr inference command - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - inputs['asr_result'] = self.run_inference(self.cmd, **kwargs) - - return inputs - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """process the asr results - """ - from funasr.utils import asr_utils - - logger.info('Computing the result of ASR ...') - - rst = {} - - # single wav or pcm task - if inputs['recog_type'] == 'wav': - if 'asr_result' in inputs and len(inputs['asr_result']) > 0: - for key, value in inputs['asr_result'][0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - - # run with datasets, and audio format is waveform or kaldi_ark or tfrecord - elif inputs['recog_type'] != 'wav': - inputs['reference_list'] = self.ref_list_tidy(inputs) - - inputs['datasets_result'] = asr_utils.compute_wer( - hyp_list=inputs['asr_result'], - ref_list=inputs['reference_list']) - - else: - raise ValueError('recog_type and audio_format are mismatching') - - if 'datasets_result' in inputs: - rst[OutputKeys.TEXT] = inputs['datasets_result'] - - return rst - - def ref_list_tidy(self, inputs: Dict[str, Any]) -> List[Any]: - ref_list = [] - - if inputs['audio_format'] == 'tfrecord': - # should assemble idx + txt - with open(inputs['reference_text'], 'r', encoding='utf-8') as r: - text_lines = r.readlines() - - with open(inputs['idx_text'], 'r', encoding='utf-8') as i: - idx_lines = i.readlines() - - j: int = 0 - while j < min(len(text_lines), len(idx_lines)): - idx_str = idx_lines[j].strip() - text_str = text_lines[j].strip().replace(' ', '') - item = {'key': idx_str, 'value': text_str} - ref_list.append(item) - j += 1 - - else: - # text contain idx + sentence - with open(inputs['reference_text'], 'r', encoding='utf-8') as f: - lines = f.readlines() - - for line in lines: - line_item = line.split(None, 1) - if len(line_item) > 1: - item = { - 'key': line_item[0], - 'value': line_item[1].strip('\n') - } - ref_list.append(item) - - return ref_list - - def run_inference(self, cmd, **kwargs): - asr_result = self.funasr_infer_modelscope(cmd['name_and_type'], - cmd['raw_inputs'], - cmd['output_dir'], cmd['fs'], - cmd['param_dict'], **kwargs) - - return asr_result diff --git a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py index 9e0eb7f5c..f80dbf4cd 100644 --- a/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py +++ b/modelscope/pipelines/audio/asr_wenet_inference_pipeline.py @@ -35,7 +35,7 @@ def __call__(self, audio_fs: int = None, recog_type: str = None, audio_format: str = None) -> Dict[str, Any]: - from funasr.utils import asr_utils + # from funasr.utils import asr_utils self.recog_type = recog_type self.audio_format = audio_format @@ -54,17 +54,17 @@ def 
__call__(self, if checking_audio_fs is not None: self.audio_fs = checking_audio_fs - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) + # if recog_type is None or audio_format is None: + # self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( + # audio_in=self.audio_in, + # recog_type=recog_type, + # audio_format=audio_format) - if hasattr(asr_utils, 'sample_rate_checking'): - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs + # if hasattr(asr_utils, 'sample_rate_checking'): + # checking_audio_fs = asr_utils.sample_rate_checking( + # self.audio_in, self.audio_format) + # if checking_audio_fs is not None: + # self.audio_fs = checking_audio_fs inputs = { 'audio': self.audio_in, diff --git a/modelscope/pipelines/audio/speaker_verification_pipeline.py b/modelscope/pipelines/audio/audio_quantization_pipeline.py similarity index 52% rename from modelscope/pipelines/audio/speaker_verification_pipeline.py rename to modelscope/pipelines/audio/audio_quantization_pipeline.py index c23058be4..76115db5f 100644 --- a/modelscope/pipelines/audio/speaker_verification_pipeline.py +++ b/modelscope/pipelines/audio/audio_quantization_pipeline.py @@ -3,6 +3,7 @@ import shutil from typing import Any, Dict, List, Sequence, Tuple, Union +import numpy as np import yaml from modelscope.metainfo import Pipelines @@ -10,34 +11,36 @@ from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_for_sv, - generate_sv_scp_from_url, +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, update_local_model) from modelscope.utils.constant import Frameworks, Tasks from modelscope.utils.logger import get_logger logger = get_logger() -__all__ = ['SpeakerVerificationPipeline'] +__all__ = ['AudioQuantizationPipeline'] @PIPELINES.register_module( - Tasks.speaker_verification, module_name=Pipelines.sv_inference) -class SpeakerVerificationPipeline(Pipeline): - """Speaker Verification Inference Pipeline - use `model` to create a Speaker Verification pipeline. + Tasks.audio_quantization, + module_name=Pipelines.audio_quantization_inference) +class AudioQuantizationPipeline(Pipeline): + """Audio Quantization Inference Pipeline + use `model` to create a audio quantization pipeline. Args: - model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + model (AudioQuantizationPipeline): A model instance, or a model local dir, or a model id in the model hub. kwargs (dict, `optional`): Extra kwargs passed into the preprocessor's constructor. 
Examples: >>> from modelscope.pipelines import pipeline - >>> pipeline_sv = pipeline( - >>> task=Tasks.speaker_verification, model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch') - >>> audio_in=('sv_example_enroll.wav', 'sv_example_same.wav') - >>> print(pipeline_sv(audio_in)) - >>> # {'label': ['Same', 'Different'], 'scores': [0.8540488358969999, 0.14595116410300013]} + >>> from modelscope.utils.constant import Tasks + >>> pipeline_aq = pipeline( + >>> task=Tasks.audio_quantization, + >>> model='damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch' + >>> ) + >>> audio_in='example.wav' + >>> print(pipeline_aq(audio_in)) """ @@ -51,8 +54,8 @@ def __init__(self, self.model_cfg = self.model.forward() self.cmd = self.get_cmd(kwargs, model) - from funasr.bin import sv_inference_launch - self.funasr_infer_modelscope = sv_inference_launch.inference_launch( + from funcodec.bin import codec_inference + self.funasr_infer_modelscope = codec_inference.inference_modelscope( mode=self.cmd['mode'], output_dir=self.cmd['output_dir'], batch_size=self.cmd['batch_size'], @@ -62,13 +65,14 @@ def __init__(self, num_workers=self.cmd['num_workers'], log_level=self.cmd['log_level'], key_file=self.cmd['key_file'], - sv_train_config=self.cmd['sv_train_config'], - sv_model_file=self.cmd['sv_model_file'], + config_file=self.cmd['config_file'], + model_file=self.cmd['model_file'], model_tag=self.cmd['model_tag'], allow_variable_data_keys=self.cmd['allow_variable_data_keys'], streaming=self.cmd['streaming'], - embedding_node=self.cmd['embedding_node'], - sv_threshold=self.cmd['sv_threshold'], + sampling_rate=self.cmd['sampling_rate'], + bit_width=self.cmd['bit_width'], + use_scale=self.cmd['use_scale'], param_dict=self.cmd['param_dict'], **kwargs, ) @@ -78,7 +82,7 @@ def __call__(self, output_dir: str = None, param_dict: dict = None) -> Dict[str, Any]: if len(audio_in) == 0: - raise ValueError('The input of sv should not be null.') + raise ValueError('The input should not be null.') else: self.audio_in = audio_in if output_dir is not None: @@ -94,19 +98,11 @@ def postprocess(self, inputs: list) -> Dict[str, Any]: """ rst = {} for i in range(len(inputs)): - # for single input, re-formate the output - # audio_in: - # list/tuple: return speaker verification scores - # single wav/bytes: return speaker embedding if len(inputs) == 1 and i == 0: - if isinstance(self.audio_in, tuple) or isinstance( - self.audio_in, list): - score = inputs[0]['value'] - rst[OutputKeys.LABEL] = ['Same', 'Different'] - rst[OutputKeys.SCORES] = [score / 100.0, 1 - score / 100.0] - else: - embedding = inputs[0]['value'] - rst[OutputKeys.SPK_EMBEDDING] = embedding + recon_wav = inputs[0]['value'] + output_wav = recon_wav.cpu().numpy()[0] + output_wav = (output_wav * (2**15)).astype(np.int16) + rst[OutputKeys.OUTPUT_WAV] = output_wav else: # for multiple inputs rst[inputs[i]['key']] = inputs[i]['value'] @@ -115,10 +111,12 @@ def postprocess(self, inputs: list) -> Dict[str, Any]: def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: # generate asr inference command mode = self.model_cfg['model_config']['mode'] - sv_model_path = self.model_cfg['model_path'] - sv_model_config = os.path.join( + _model_path = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['model_file']) + _model_config = os.path.join( self.model_cfg['model_workspace'], - self.model_cfg['model_config']['sv_model_config']) + self.model_cfg['model_config']['config_file']) update_local_model(self.model_cfg['model_config'], 
model_path, extra_args) cmd = { @@ -131,25 +129,27 @@ def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: 'num_workers': 0, 'log_level': 'ERROR', 'key_file': None, - 'sv_model_file': sv_model_path, - 'sv_train_config': sv_model_config, + 'model_file': _model_path, + 'config_file': _model_config, 'model_tag': None, 'allow_variable_data_keys': True, 'streaming': False, - 'embedding_node': 'resnet1_dense', - 'sv_threshold': 0.9465, + 'sampling_rate': 16000, + 'bit_width': 8000, + 'use_scale': True, 'param_dict': None, } user_args_dict = [ 'output_dir', 'batch_size', 'ngpu', - 'embedding_node', - 'sv_threshold', 'log_level', 'allow_variable_data_keys', 'streaming', 'num_workers', + 'sampling_rate', + 'bit_width', + 'use_scale', 'param_dict', ] @@ -181,69 +181,34 @@ def forward(self, audio_in: Union[tuple, str, Any] = None) -> list: """Decoding """ # log file_path/url or tuple (str, str) - if isinstance(audio_in, str) or \ - (isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)): - logger.info(f'Speaker Verification Processing: {audio_in} ...') + if isinstance(audio_in, str): + logger.info(f'Audio Quantization Processing: {audio_in} ...') else: logger.info( - f'Speaker Verification Processing: {str(audio_in)[:100]} ...') + f'Audio Quantization Processing: {str(audio_in)[:100]} ...') data_cmd, raw_inputs = None, None - if isinstance(audio_in, tuple) or isinstance(audio_in, list): - # generate audio_scp - assert len(audio_in) == 2 - if isinstance(audio_in[0], str): - # for scp inputs - if len(audio_in[0].split(',')) == 3 and audio_in[0].split( - ',')[0].endswith('.scp'): - if len(audio_in[1].split(',')) == 3 and audio_in[1].split( - ',')[0].endswith('.scp'): - data_cmd = [ - tuple(audio_in[0].split(',')), - tuple(audio_in[1].split(',')) - ] - # for single-file inputs - else: - audio_scp_1, audio_scp_2 = generate_sv_scp_from_url( - audio_in) - if isinstance(audio_scp_1, bytes) and isinstance( - audio_scp_2, bytes): - data_cmd = [(audio_scp_1, 'speech', 'bytes'), - (audio_scp_2, 'ref_speech', 'bytes')] - else: - data_cmd = [(audio_scp_1, 'speech', 'sound'), - (audio_scp_2, 'ref_speech', 'sound')] - # for raw bytes inputs - elif isinstance(audio_in[0], bytes): - data_cmd = [(audio_in[0], 'speech', 'bytes'), - (audio_in[1], 'ref_speech', 'bytes')] + if isinstance(audio_in, str): + # for scp inputs + if len(audio_in.split(',')) == 3: + data_cmd = [tuple(audio_in.split(','))] + # for single-file inputs else: - raise TypeError('Unsupported data type.') + audio_scp, _ = generate_scp_from_url(audio_in) + raw_inputs = audio_scp + # for raw bytes + elif isinstance(audio_in, bytes): + data_cmd = (audio_in, 'speech', 'bytes') + # for ndarray and tensor inputs else: - if isinstance(audio_in, str): - # for scp inputs - if len(audio_in.split(',')) == 3: - data_cmd = [audio_in.split(',')] - # for single-file inputs - else: - audio_scp = generate_scp_for_sv(audio_in) - if isinstance(audio_scp, bytes): - data_cmd = [(audio_scp, 'speech', 'bytes')] - else: - data_cmd = [(audio_scp, 'speech', 'sound')] - # for raw bytes - elif isinstance(audio_in, bytes): - data_cmd = [(audio_in, 'speech', 'bytes')] - # for ndarray and tensor inputs + import torch + import numpy as np + if isinstance(audio_in, torch.Tensor): + raw_inputs = audio_in + elif isinstance(audio_in, np.ndarray): + raw_inputs = audio_in else: - import torch - import numpy as np - if isinstance(audio_in, torch.Tensor): - raw_inputs = audio_in - elif isinstance(audio_in, np.ndarray): - raw_inputs = audio_in - else: - raise 
TypeError('Unsupported data type.') + raise TypeError('Unsupported data type.') self.cmd['name_and_type'] = data_cmd self.cmd['raw_inputs'] = raw_inputs diff --git a/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py b/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py new file mode 100644 index 000000000..52de7d799 --- /dev/null +++ b/modelscope/pipelines/audio/codec_based_synthesis_pipeline.py @@ -0,0 +1,276 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict, Optional, Union + +import json +import numpy as np + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, + update_local_model) +from modelscope.utils.constant import Frameworks, ModelFile, Tasks +from modelscope.utils.hub import snapshot_download +from modelscope.utils.logger import get_logger + +__all__ = ['LauraCodecTTSPipeline'] + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.text_to_speech, module_name=Pipelines.laura_codec_tts_inference) +class LauraCodecTTSPipeline(Pipeline): + """Laura-style Codec-based TTS Inference Pipeline + use `model` to create a TTS pipeline. + + Args: + model (LauraCodecTTSPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + Examples: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> my_pipeline = pipeline( + >>> task=Tasks.text_to_speech, + >>> model='damo/speech_synthesizer-laura-en-libritts-16k-codec_nq2-pytorch' + >>> ) + >>> text='nothing was to be done but to put about, and return in disappointment towards the north.' 
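Hedged usage sketch for the audio-quantization pipeline above. The model id and input file come from the class docstring; writing the result assumes 16 kHz int16 PCM, which matches postprocess() and the model name.

import soundfile as sf

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

aq = pipeline(
    task=Tasks.audio_quantization,
    model='damo/audio_codec-encodec-zh_en-general-16k-nq32ds640-pytorch')
result = aq('example.wav')  # also accepts a URL, raw bytes, an ndarray or a tensor
sf.write('recon.wav', result[OutputKeys.OUTPUT_WAV], 16000)  # int16 samples from postprocess()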
+ >>> prompt_text='one of these is context' + >>> prompt_speech='example/prompt.wav' + >>> print(my_pipeline(text)) + + """ + + def __init__(self, + model: Union[Model, str] = None, + codec_model: Optional[Union[Model, str]] = None, + codec_model_revision: Optional[str] = None, + ngpu: int = 1, + **kwargs): + """use `model` to create an asr pipeline for prediction + """ + super().__init__(model=model, **kwargs) + self.model_cfg = self.model.forward() + self.codec_model = codec_model + self.codec_model_revision = codec_model_revision + self.cmd = self.get_cmd(kwargs, model) + + from funcodec.bin import text2audio_inference + self.funasr_infer_modelscope = text2audio_inference.inference_func( + mode=self.cmd['mode'], + output_dir=self.cmd['output_dir'], + batch_size=self.cmd['batch_size'], + dtype=self.cmd['dtype'], + ngpu=ngpu, + seed=self.cmd['seed'], + num_workers=self.cmd['num_workers'], + log_level=self.cmd['log_level'], + key_file=self.cmd['key_file'], + config_file=self.cmd['config_file'], + model_file=self.cmd['model_file'], + model_tag=self.cmd['model_tag'], + allow_variable_data_keys=self.cmd['allow_variable_data_keys'], + streaming=self.cmd['streaming'], + text_emb_model=self.cmd['text_emb_model'], + beam_size=self.cmd['beam_size'], + sampling=self.cmd['sampling'], + continual=self.cmd['continual'], + tokenize_to_phone=self.cmd['tokenize_to_phone'], + exclude_prompt=self.cmd['exclude_prompt'], + codec_config_file=self.cmd['codec_config_file'], + codec_model_file=self.cmd['codec_model_file'], + param_dict=self.cmd['param_dict']) + + def __call__(self, + text: Union[tuple, str, Any] = None, + prompt_text: Union[tuple, str, Any] = None, + prompt_audio: Union[tuple, str, Any] = None, + output_dir: str = None, + param_dict: dict = None) -> Dict[str, Any]: + if len(text) == 0: + raise ValueError('The input should not be null.') + if output_dir is not None: + self.cmd['output_dir'] = output_dir + self.cmd['param_dict'] = param_dict + + output = self.forward(text, prompt_text, prompt_audio) + result = self.postprocess(output) + return result + + def postprocess(self, inputs: list) -> Dict[str, Any]: + """Postprocessing + """ + rst = {} + for i in range(len(inputs)): + if len(inputs) == 1 and i == 0: + recon_wav = inputs[0]['value']['gen'] + rst[OutputKeys.OUTPUT_WAV] = recon_wav.cpu().numpy()[0] + else: + # for multiple inputs + rst[inputs[i]['key']] = inputs[i]['value']['gen'] + return rst + + def load_codec_model(self, cmd): + if self.codec_model is not None and self.codec_model != '': + if os.path.exists(self.codec_model): + codec_model = self.codec_model + else: + codec_model = snapshot_download( + self.codec_model, revision=self.codec_model_revision) + logger.info('loading codec model from {0} ...'.format(codec_model)) + config_path = os.path.join(codec_model, ModelFile.CONFIGURATION) + model_cfg = json.loads(open(config_path).read()) + model_dir = os.path.dirname(config_path) + cmd['codec_model_file'] = os.path.join( + model_dir, model_cfg['model']['model_config']['model_file']) + cmd['codec_config_file'] = os.path.join( + model_dir, model_cfg['model']['model_config']['config_file']) + + def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: + # generate asr inference command + mode = self.model_cfg['model_config']['mode'] + _model_path = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['model_file']) + _model_config = os.path.join( + self.model_cfg['model_workspace'], + self.model_cfg['model_config']['config_file']) + 
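A sketch of the codec-model resolution performed by load_codec_model() above: accept either a local directory or a hub id, then read the checkpoint and config file names from the model's configuration.json. The helper name is illustrative; the key layout follows the code above.

import json
import os

from modelscope.utils.constant import ModelFile
from modelscope.utils.hub import snapshot_download


def resolve_codec(codec_model: str, revision: str = None):
    model_dir = codec_model if os.path.exists(codec_model) else snapshot_download(
        codec_model, revision=revision)
    with open(os.path.join(model_dir, ModelFile.CONFIGURATION), encoding='utf-8') as f:
        cfg = json.load(f)
    model_config = cfg['model']['model_config']
    return (os.path.join(model_dir, model_config['model_file']),
            os.path.join(model_dir, model_config['config_file']))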
update_local_model(self.model_cfg['model_config'], model_path, + extra_args) + + cmd = { + 'mode': mode, + 'output_dir': None, + 'batch_size': 1, + 'dtype': 'float32', + 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available + 'seed': 0, + 'num_workers': 0, + 'log_level': 'ERROR', + 'key_file': None, + 'model_file': _model_path, + 'config_file': _model_config, + 'model_tag': None, + 'allow_variable_data_keys': True, + 'streaming': False, + 'beam_size': 1, + 'sampling': 25, + 'text_emb_model': None, + 'continual': True, + 'tokenize_to_phone': True, + 'exclude_prompt': True, + 'codec_model_file': None, + 'codec_config_file': None, + 'param_dict': None, + } + user_args_dict = [ + 'output_dir', + 'batch_size', + 'ngpu', + 'log_level', + 'allow_variable_data_keys', + 'streaming', + 'num_workers', + 'sampling_rate', + 'bit_width', + 'use_scale', + 'param_dict', + ] + + model_config = self.model_cfg['model_config'] + if model_config.__contains__( + 'codec_model') and self.codec_model is None: + self.codec_model = model_config['codec_model'] + if model_config.__contains__( + 'codec_model_revision') and self.codec_model_revision is None: + self.codec_model_revision = model_config['codec_model_revision'] + self.load_codec_model(cmd) + + # re-write the config with configure.json + for user_args in user_args_dict: + if (user_args in self.model_cfg['model_config'] + and self.model_cfg['model_config'][user_args] is not None): + if isinstance(cmd[user_args], dict) and isinstance( + self.model_cfg['model_config'][user_args], dict): + cmd[user_args].update( + self.model_cfg['model_config'][user_args]) + else: + cmd[user_args] = self.model_cfg['model_config'][user_args] + + # rewrite the config with user args + for user_args in user_args_dict: + if user_args in extra_args: + if extra_args.get(user_args) is not None: + if isinstance(cmd[user_args], dict) and isinstance( + extra_args[user_args], dict): + cmd[user_args].update(extra_args[user_args]) + else: + cmd[user_args] = extra_args[user_args] + del extra_args[user_args] + + return cmd + + def forward(self, + text: Union[tuple, str, Any] = None, + prompt_text: Union[tuple, str, Any] = None, + prompt_audio: Union[tuple, str, Any] = None, + **forward_params) -> list: + """Decoding + """ + if isinstance(text, str): + logger.info(f'Generate speech for: {text} ...') + + data_cmd, raw_inputs = None, None + # process text input + # for scp inputs + if len(text.split(',')) == 3: + data_cmd = [tuple(text.split(','))] + # for single-file inputs + else: + raw_inputs = [text] + + if prompt_text is not None and prompt_audio is not None: + if len(prompt_text.split(',')) == 3: + data_cmd.append(tuple(prompt_text.split(','))) + else: + raw_inputs.append(prompt_text) + + if isinstance(prompt_audio, str): + if len(prompt_audio.split(',')) == 3: + data_cmd.append(tuple(prompt_audio.split(','))) + else: + audio_path, _ = generate_scp_from_url(prompt_audio) + raw_inputs.append(audio_path) + # for ndarray and tensor inputs + else: + import torch + if isinstance(prompt_audio, torch.Tensor): + raw_inputs.append(prompt_audio.numpy()) + elif isinstance(prompt_audio, np.ndarray): + raw_inputs.append(prompt_audio) + else: + raise TypeError( + f'Unsupported prompt audio type {type(prompt_audio)}.') + + self.cmd['name_and_type'] = data_cmd + self.cmd['raw_inputs'] = raw_inputs + result = self.run_inference(self.cmd) + + return result + + def run_inference(self, cmd): + if self.framework == Frameworks.torch: + sv_result = self.funasr_infer_modelscope( + 
data_path_and_name_and_type=cmd['name_and_type'], + raw_inputs=cmd['raw_inputs'], + output_dir_v2=cmd['output_dir'], + param_dict=cmd['param_dict']) + else: + raise ValueError('model type is mismatching') + + return sv_result diff --git a/modelscope/pipelines/audio/funasr_pipeline.py b/modelscope/pipelines/audio/funasr_pipeline.py new file mode 100644 index 000000000..4b66b6ab2 --- /dev/null +++ b/modelscope/pipelines/audio/funasr_pipeline.py @@ -0,0 +1,75 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import os +from typing import Any, Dict, List, Sequence, Tuple, Union + +import json +import yaml + +from modelscope.metainfo import Pipelines +from modelscope.models import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.audio.audio_utils import (generate_scp_from_url, + update_local_model) +from modelscope.utils.constant import Frameworks, ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + +__all__ = ['FunASRPipeline'] + + +@PIPELINES.register_module( + Tasks.auto_speech_recognition, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.voice_activity_detection, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.language_score_prediction, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.punctuation, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speaker_diarization, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speaker_verification, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speech_separation, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.speech_timestamp, module_name=Pipelines.funasr_pipeline) +@PIPELINES.register_module( + Tasks.emotion_recognition, module_name=Pipelines.funasr_pipeline) +class FunASRPipeline(Pipeline): + """Voice Activity Detection Inference Pipeline + use `model` to create a Voice Activity Detection pipeline. + + Args: + model: A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the preprocessor's constructor. + + Example: + >>> from modelscope.pipelines import pipeline + >>> p = pipeline( + >>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch') + >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm' + >>> print(p(audio_in)) + + """ + + def __init__(self, model: Union[Model, str] = None, **kwargs): + """use `model` to create an vad pipeline for prediction + """ + super().__init__(model=model, **kwargs) + + def __call__(self, *args, **kwargs) -> Dict[str, Any]: + """ + Decoding the input audios + Args: + input('str' or 'bytes'): + Return: + a list of dictionary of result. 
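Hedged usage sketch for the Laura codec TTS pipeline above, showing a speaker prompt actually being passed (the class docstring defines a prompt but does not pass it to the call). The model id, text and prompt file come from that docstring; the 16 kHz write rate is assumed from the model name.

import soundfile as sf

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

tts = pipeline(
    task=Tasks.text_to_speech,
    model='damo/speech_synthesizer-laura-en-libritts-16k-codec_nq2-pytorch')
result = tts(
    'nothing was to be done but to put about, and return in disappointment towards the north.',
    prompt_text='one of these is context',
    prompt_audio='example/prompt.wav')
sf.write('gen.wav', result[OutputKeys.OUTPUT_WAV], 16000)  # generated waveform from postprocess()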
+ """ + + output = self.model(*args, **kwargs) + + return output diff --git a/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py b/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py index 1b9c7f799..0865bdfef 100644 --- a/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py +++ b/modelscope/pipelines/audio/language_recognition_eres2net_pipeline.py @@ -55,24 +55,34 @@ def __call__(self, in_audios: Union[str, list, np.ndarray], out_file: str = None): wavs = self.preprocess(in_audios) - results = self.forward(wavs) - outputs = self.postprocess(results, in_audios, out_file) + scores, results = self.forward(wavs) + outputs = self.postprocess(results, scores, in_audios, out_file) return outputs def forward(self, inputs: list): + scores = [] results = [] for x in inputs: - results.append(self.model(x).item()) - return results + score, result = self.model(x) + scores.append(score.tolist()) + results.append(result.item()) + return scores, results def postprocess(self, inputs: list, + scores: list, in_audios: Union[str, list, np.ndarray], out_file=None): if isinstance(in_audios, str): - output = {OutputKeys.TEXT: self.languages[inputs[0]]} + output = { + OutputKeys.TEXT: self.languages[inputs[0]], + OutputKeys.SCORE: scores + } else: - output = {OutputKeys.TEXT: [self.languages[i] for i in inputs]} + output = { + OutputKeys.TEXT: [self.languages[i] for i in inputs], + OutputKeys.SCORE: scores + } if out_file is not None: out_lines = [] for i, audio in enumerate(in_audios): diff --git a/modelscope/pipelines/audio/language_recognition_pipeline.py b/modelscope/pipelines/audio/language_recognition_pipeline.py index 00adcfff4..353232d7b 100644 --- a/modelscope/pipelines/audio/language_recognition_pipeline.py +++ b/modelscope/pipelines/audio/language_recognition_pipeline.py @@ -55,24 +55,34 @@ def __call__(self, in_audios: Union[str, list, np.ndarray], out_file: str = None): wavs = self.preprocess(in_audios) - results = self.forward(wavs) - outputs = self.postprocess(results, in_audios, out_file) + scores, results = self.forward(wavs) + outputs = self.postprocess(results, scores, in_audios, out_file) return outputs def forward(self, inputs: list): + scores = [] results = [] for x in inputs: - results.append(self.model(x).item()) - return results + score, result = self.model(x) + scores.append(score.tolist()) + results.append(result.item()) + return scores, results def postprocess(self, inputs: list, + scores: list, in_audios: Union[str, list, np.ndarray], out_file=None): if isinstance(in_audios, str): - output = {OutputKeys.TEXT: self.languages[inputs[0]]} + output = { + OutputKeys.TEXT: self.languages[inputs[0]], + OutputKeys.SCORE: scores + } else: - output = {OutputKeys.TEXT: [self.languages[i] for i in inputs]} + output = { + OutputKeys.TEXT: [self.languages[i] for i in inputs], + OutputKeys.SCORE: scores + } if out_file is not None: out_lines = [] for i, audio in enumerate(in_audios): diff --git a/modelscope/pipelines/audio/lm_infer_pipeline.py b/modelscope/pipelines/audio/lm_infer_pipeline.py deleted file mode 100644 index e1524ebd3..000000000 --- a/modelscope/pipelines/audio/lm_infer_pipeline.py +++ /dev/null @@ -1,230 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -from typing import Any, Dict, Union - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_text_from_url, - update_local_model) -from modelscope.utils.config import Config -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['LanguageModelPipeline'] - - -@PIPELINES.register_module( - Tasks.language_score_prediction, module_name=Pipelines.lm_inference) -class LanguageModelPipeline(Pipeline): - """Language Model Inference Pipeline - - Example: - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> inference_pipeline = pipeline( - >>> task=Tasks.language_score_prediction, - >>> model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch') - >>> text_in='hello 大 家 好 呀' - >>> print(inference_pipeline(text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` to create a LM pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - model_file('str'): - LM model file - train_config('str'): - LM infer configuration - num_workers('int'): - the number of workers used for DataLoader - log_level('str'): - log level - log_base('float', defaults to 10.0): - the base of logarithm for Perplexity - split_with_space('bool'): - split the input sentence by space - seg_dict_file('str'): - seg dict file - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import lm_inference_launch - self.funasr_infer_modelscope = lm_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - train_config=self.cmd['train_config'], - model_file=self.cmd['model_file'], - log_base=self.cmd['log_base'], - split_with_space=self.cmd['split_with_space'], - seg_dict_file=self.cmd['seg_dict_file'], - output_dir=self.cmd['output_dir'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - text_in: str = None, - output_dir: str = None, - param_dict: dict = None) -> Dict[str, Any]: - """ - Compute PPL - Args: - text_in('str'): - - A text str input - - A local text file input endswith .txt or .scp - - A url text file input - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The PPL result. 
- """ - if len(text_in) == 0: - raise ValueError('The input of lm should not be null.') - else: - self.text_in = text_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - output = self.forward(self.text_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - text = inputs[0]['value'] - if len(text) > 0: - rst[OutputKeys.TEXT] = text - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - # generate inference command - model_cfg = Config.from_file(config_path) - model_dir = os.path.dirname(config_path) - mode = model_cfg.model['model_config']['mode'] - lm_model_path = os.path.join( - model_dir, model_cfg.model['model_config']['lm_model_name']) - lm_model_config = os.path.join( - model_dir, model_cfg.model['model_config']['lm_model_config']) - seg_dict_file = None - if 'seg_dict_file' in model_cfg.model['model_config']: - seg_dict_file = os.path.join( - model_dir, model_cfg.model['model_config']['seg_dict_file']) - update_local_model(model_cfg.model['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'train_config': lm_model_config, - 'model_file': lm_model_path, - 'log_base': 10.0, - 'allow_variable_data_keys': False, - 'split_with_space': True, - 'seg_dict_file': seg_dict_file, - 'output_dir': None, - 'param_dict': None, - } - - user_args_dict = [ - 'batch_size', - 'ngpu', - 'num_workers', - 'log_level', - 'train_config', - 'model_file', - 'log_base', - 'split_with_space', - 'seg_dict_file', - 'output_dir', - 'param_dict', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, text_in: str = None) -> list: - """Decoding - """ - logger.info('Compute PPL : {0} ...'.format(text_in)) - # generate text_in - text_file, raw_inputs = generate_text_from_url(text_in) - data_cmd = None - if raw_inputs is None: - data_cmd = [(text_file, 'text', 'text')] - elif text_file is None and raw_inputs is not None: - data_cmd = None - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - lm_result = self.run_inference(self.cmd) - - return lm_result - - def run_inference(self, cmd): - if self.framework == Frameworks.torch: - lm_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - param_dict=cmd['param_dict']) - else: - raise ValueError('model type is mismatching') - - return lm_result diff --git a/modelscope/pipelines/audio/punctuation_processing_pipeline.py b/modelscope/pipelines/audio/punctuation_processing_pipeline.py deleted file mode 100644 index 4e41e0c09..000000000 --- a/modelscope/pipelines/audio/punctuation_processing_pipeline.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
-import os -import shutil -from typing import Any, Dict, List, Sequence, Tuple, Union - -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_text_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['PunctuationProcessingPipeline'] - - -@PIPELINES.register_module( - Tasks.punctuation, module_name=Pipelines.punc_inference) -class PunctuationProcessingPipeline(Pipeline): - """Punctuation Processing Inference Pipeline - use `model` to create a Punctuation Processing pipeline. - - Args: - model (PunctuationProcessingPipeline): A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. - Examples - >>> from modelscope.pipelines import pipeline - >>> pipeline_punc = pipeline( - >>> task=Tasks.punctuation, model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch') - >>> text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt' - >>> print(pipeline_punc(text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create an asr pipeline for prediction - """ - super().__init__(model=model, **kwargs) - self.model_cfg = self.model.forward() - self.cmd = self.get_cmd(kwargs, model) - - from funasr.bin import punc_inference_launch - self.funasr_infer_modelscope = punc_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - train_config=self.cmd['train_config'], - model_file=self.cmd['model_file'], - output_dir=self.cmd['output_dir'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - text_in: str = None, - output_dir: str = None, - cache: List[Any] = None, - param_dict: dict = None) -> Dict[str, Any]: - if len(text_in) == 0: - raise ValueError('The input of punctuation should not be null.') - else: - self.text_in = text_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if cache is not None: - self.cmd['cache'] = cache - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - output = self.forward(self.text_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - for key, value in inputs[0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, extra_args, model_path) -> Dict[str, Any]: - # generate inference command - lang = self.model_cfg['model_config']['lang'] - punc_model_path = self.model_cfg['punc_model_path'] - punc_model_config = os.path.join( - self.model_cfg['model_workspace'], - self.model_cfg['model_config']['punc_config']) - mode = self.model_cfg['model_config']['mode'] - update_local_model(self.model_cfg['model_config'], model_path, - extra_args) - cmd = { - 
'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'train_config': punc_model_config, - 'model_file': punc_model_path, - 'output_dir': None, - 'lang': lang, - 'cache': None, - 'param_dict': None, - } - - user_args_dict = [ - 'batch_size', - 'dtype', - 'ngpu', - 'seed', - 'num_workers', - 'log_level', - 'train_config', - 'model_file', - 'output_dir', - 'lang', - 'param_dict', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, text_in: str = None) -> list: - """Decoding - """ - logger.info('Punctuation Processing: {0} ...'.format(text_in)) - # generate text_in - text_file, raw_inputs = generate_text_from_url(text_in) - if raw_inputs is None: - data_cmd = [(text_file, 'text', 'text')] - elif text_file is None and raw_inputs is not None: - data_cmd = None - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - punc_result = self.run_inference(self.cmd) - - return punc_result - - def run_inference(self, cmd): - punc_result = '' - if self.framework == Frameworks.torch: - punc_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - cache=cmd['cache'], - param_dict=cmd['param_dict']) - else: - raise ValueError('model type is mismatching') - - return punc_result diff --git a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py index e4810bcfe..9f6961e2d 100644 --- a/modelscope/pipelines/audio/segmentation_clustering_pipeline.py +++ b/modelscope/pipelines/audio/segmentation_clustering_pipeline.py @@ -179,16 +179,17 @@ def preprocess(self, audio: Union[str, np.ndarray, list]) -> list: if not hasattr(self, 'vad_pipeline'): self.vad_pipeline = pipeline( task=Tasks.voice_activity_detection, - model=self.config['vad_model']) - vad_time = self.vad_pipeline(audio, audio_fs=self.fs) + model=self.config['vad_model'], + model_revision='v2.0.2') + vad_time = self.vad_pipeline( + audio, fs=self.fs, is_final=True)[0]['value'] vad_segments = [] - if isinstance(vad_time['text'], str): - vad_time_list = ast.literal_eval(vad_time['text']) - elif isinstance(vad_time['text'], list): - vad_time_list = vad_time['text'] + if isinstance(vad_time, str): + vad_time_list = ast.literal_eval(vad_time) + elif isinstance(vad_time, list): + vad_time_list = vad_time else: - raise ValueError('Incorrect vad result. Get %s' % - (type(vad_time['text']))) + raise ValueError('Incorrect vad result. Get %s' % (type(vad_time))) for t in vad_time_list: st = int(t[0]) / 1000 ed = int(t[1]) / 1000 diff --git a/modelscope/pipelines/audio/speaker_diarization_pipeline.py b/modelscope/pipelines/audio/speaker_diarization_pipeline.py deleted file mode 100644 index dfb808d04..000000000 --- a/modelscope/pipelines/audio/speaker_diarization_pipeline.py +++ /dev/null @@ -1,287 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. 
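# A minimal sketch of the updated VAD-result handling in SegmentationClusteringPipeline.preprocess
# (see the segmentation_clustering_pipeline.py hunk above): the VAD pipeline's [0]['value'] may be
# returned as a list or as its string form, and segment boundaries are converted from milliseconds
# to seconds. The timings below are illustrative assumptions, not values from this diff.
import ast

vad_value = '[[0, 1250], [1800, 4100]]'  # string form of a possible VAD output
vad_time_list = ast.literal_eval(vad_value) if isinstance(vad_value, str) else vad_value
segments = [(int(t[0]) / 1000, int(t[1]) / 1000) for t in vad_time_list]
print(segments)  # [(0.0, 1.25), (1.8, 4.1)] -> (start, end) in seconds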
-import os -import shutil -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -import json -import numpy -import yaml - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_for_sv, - generate_sd_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.hub import snapshot_download -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['SpeakerDiarizationPipeline'] - - -@PIPELINES.register_module( - Tasks.speaker_diarization, - module_name=Pipelines.speaker_diarization_inference) -class SpeakerDiarizationPipeline(Pipeline): - """Speaker Diarization Inference Pipeline - use `model` to create a Speaker Diarization pipeline. - - Args: - model (SpeakerDiarizationPipeline): A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. - Examples: - >>> from modelscope.pipelines import pipeline - >>> pipeline_sd = pipeline( - >>> task=Tasks.speaker_diarization, model='damo/xxxxxxxxxxxxx') - >>> audio_in=('','','','') - >>> print(pipeline_sd(audio_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - sv_model: Optional[Union[Model, str]] = None, - sv_model_revision: Optional[str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create a speaker diarization pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - sv_model (Optional: 'Model' or 'str'): - speaker verification model from model hub or local - example: 'damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch' - sv_model_revision (Optional: 'str'): - speaker verfication model revision from model hub - """ - super().__init__(model=model, **kwargs) - self.model_cfg = None - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.sv_model = sv_model - self.sv_model_revision = sv_model_revision - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import diar_inference_launch - self.funasr_infer_modelscope = diar_inference_launch.inference_launch( - mode=self.cmd['mode'], - output_dir=self.cmd['output_dir'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - diar_train_config=self.cmd['diar_train_config'], - diar_model_file=self.cmd['diar_model_file'], - model_tag=self.cmd['model_tag'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - streaming=self.cmd['streaming'], - smooth_size=self.cmd['smooth_size'], - dur_threshold=self.cmd['dur_threshold'], - out_format=self.cmd['out_format'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[tuple, str, Any] = None, - output_dir: str = None, - param_dict: dict = None) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - output_dir('str'): - output dir - param_dict('dict'): 
- extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The speaker diarization result. - """ - if len(audio_in) == 0: - raise ValueError('The input of sv should not be null.') - else: - self.audio_in = audio_in - if output_dir is not None: - self.cmd['output_dir'] = output_dir - self.cmd['param_dict'] = param_dict - - output = self.forward(self.audio_in) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: list) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - # for demo service - if i == 0 and len(inputs) == 1: - rst[OutputKeys.TEXT] = inputs[0]['value'] - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - self.model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate sd inference command - mode = self.model_cfg['model']['model_config']['mode'] - diar_model_path = os.path.join( - model_dir, - self.model_cfg['model']['model_config']['diar_model_name']) - diar_model_config = os.path.join( - model_dir, - self.model_cfg['model']['model_config']['diar_model_config']) - update_local_model(self.model_cfg['model']['model_config'], model_path, - extra_args) - cmd = { - 'mode': mode, - 'output_dir': None, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'diar_model_file': diar_model_path, - 'diar_train_config': diar_model_config, - 'model_tag': None, - 'allow_variable_data_keys': True, - 'streaming': False, - 'smooth_size': 83, - 'dur_threshold': 10, - 'out_format': 'vad', - 'param_dict': { - 'sv_model_file': None, - 'sv_train_config': None - }, - } - user_args_dict = [ - 'mode', - 'output_dir', - 'batch_size', - 'ngpu', - 'log_level', - 'allow_variable_data_keys', - 'streaming', - 'num_workers', - 'smooth_size', - 'dur_threshold', - 'out_format', - 'param_dict', - ] - model_config = self.model_cfg['model']['model_config'] - if model_config.__contains__('sv_model') and self.sv_model != '': - self.sv_model = model_config['sv_model'] - if model_config.__contains__('sv_model_revision'): - self.sv_model_revision = model_config['sv_model_revision'] - self.load_sv_model(cmd) - - # rewrite the config with user args - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - if isinstance(cmd[user_args], dict) and isinstance( - extra_args[user_args], dict): - cmd[user_args].update(extra_args[user_args]) - else: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def load_sv_model(self, cmd): - if self.sv_model is not None and self.sv_model != '': - if os.path.exists(self.sv_model): - sv_model = self.sv_model - else: - sv_model = snapshot_download( - self.sv_model, revision=self.sv_model_revision) - logger.info( - 'loading speaker verification model from {0} ...'.format( - sv_model)) - config_path = os.path.join(sv_model, ModelFile.CONFIGURATION) - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - cmd['param_dict']['sv_model_file'] = os.path.join( - model_dir, model_cfg['model']['model_config']['sv_model_name']) - cmd['param_dict']['sv_train_config'] = os.path.join( - model_dir, - model_cfg['model']['model_config']['sv_model_config']) 
- - def forward(self, audio_in: Union[tuple, str, Any] = None) -> list: - """Decoding - """ - # log file_path/url or tuple (str, str) - if isinstance(audio_in, str) or \ - (isinstance(audio_in, tuple) and all(isinstance(item, str) for item in audio_in)): - logger.info(f'Speaker Verification Processing: {audio_in} ...') - else: - logger.info( - f'Speaker Verification Processing: {str(audio_in)[:100]} ...') - - data_cmd, raw_inputs = None, None - if isinstance(audio_in, tuple) or isinstance(audio_in, list): - # generate audio_scp - if isinstance(audio_in[0], str): - # for scp inputs - if len(audio_in[0].split(',')) == 3 and audio_in[0].split( - ',')[0].endswith('.scp'): - data_cmd = [] - for audio_cmd in audio_in: - if len(audio_cmd.split(',')) == 3 and audio_cmd.split( - ',')[0].endswith('.scp'): - data_cmd.append(tuple(audio_cmd.split(','))) - # for audio-list inputs - else: - raw_inputs = generate_sd_scp_from_url(audio_in) - # for raw bytes inputs - elif isinstance(audio_in[0], (bytes, numpy.ndarray)): - raw_inputs = audio_in - else: - raise TypeError( - 'Unsupported data type, it must be data_name_type_path, ' - 'file_path, url, bytes or numpy.ndarray') - else: - raise TypeError( - 'audio_in must be a list of data_name_type_path, file_path, ' - 'url, bytes or numpy.ndarray') - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = raw_inputs - result = self.run_inference(self.cmd) - - return result - - def run_inference(self, cmd): - if self.framework == Frameworks.torch: - diar_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - param_dict=cmd['param_dict']) - else: - raise ValueError( - 'framework is mismatching, which should be pytorch') - - return diar_result diff --git a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py index ba28ed6e2..507e761df 100644 --- a/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py +++ b/modelscope/pipelines/audio/speaker_verification_eres2net_pipeline.py @@ -1,6 +1,7 @@ # Copyright (c) Alibaba, Inc. and its affiliates. import io +import os from typing import Any, Dict, List, Union import numpy as np diff --git a/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py b/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py new file mode 100644 index 000000000..edac14446 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_eres2netv2_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_eres2netv2) +class ERes2NetV2_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. 
+ kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' 
+ ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py b/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py new file mode 100644 index 000000000..308190601 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_res2net_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_res2net) +class Res2Net_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='iic/speech_res2net_sv_zh-cn_3dspeaker_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' 
+ % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' + ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py b/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py new file mode 100644 index 000000000..8b2b59dba --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_resnet_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
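# A minimal usage sketch for the speaker-verification pipelines added in this diff
# (the ERes2NetV2, Res2Net, ResNet and TDNN pipelines share the same __call__ signature).
# The model id is taken from the ResNet pipeline docstring below; the wav paths and the
# threshold value are assumptions for illustration only.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

sv = pipeline(
    task=Tasks.speaker_verification,
    model='iic/speech_resnet34_sv_zh-cn_3dspeaker_16k')
# Two utterances -> dict with OutputKeys.SCORE (cosine similarity) and OutputKeys.TEXT ('yes'/'no')
print(sv(['speaker1_a.wav', 'speaker1_b.wav'], thr=0.6))
# output_emb=True additionally returns the raw embeddings under 'embs'
embs = sv(['speaker1_a.wav', 'speaker1_b.wav'], output_emb=True)['embs']
print(embs.shape)  # (2, embedding_dim)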
+ +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_resnet) +class ResNet_Pipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='iic/speech_resnet34_sv_zh-cn_3dspeaker_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' 
+ % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' + ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py b/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py new file mode 100644 index 000000000..352d448ba --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_sdpn_pipeline.py @@ -0,0 +1,110 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +from typing import Any, Dict, List, Union + +import soundfile as sf +import torch + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_sdpn) +class SDPNPipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor. + Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + + def __call__(self, + in_audios: List[str], + thr: float = None) -> Dict[str, Any]: + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' 
+ % self.thr) + outputs = self.preprocess(in_audios) + outputs = self.forward(outputs) + outputs = self.postprocess(outputs) + + return outputs + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + emb1 = self.model(inputs['data1']) + emb2 = self.model(inputs['data2']) + + return {'emb1': emb1, 'emb2': emb2} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + score = self.compute_cos_similarity(inputs['emb1'], inputs['emb2']) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + + return {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + + def preprocess(self, inputs: List[str], + **preprocess_params) -> Dict[str, Any]: + if len(inputs) != 2: + raise ValueError( + 'modelscope error: Two input audio files are required.') + output = {} + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + if fs != self.model_config['sample_rate']: + raise ValueError( + 'modelscope error: Only support %d sample rate files' + % self.model_config['sample_rate']) + output['data%d' % + (i + 1)] = torch.from_numpy(data).unsqueeze(0) + else: + raise ValueError( + 'modelscope error: The input type of input %d is temporarily restricted to audio file address.' + % i) + return output + + def compute_cos_similarity(self, emb1: torch.Tensor, + emb2: torch.Tensor) -> float: + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py b/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py new file mode 100644 index 000000000..4c8a6f321 --- /dev/null +++ b/modelscope/pipelines/audio/speaker_verification_tdnn_pipeline.py @@ -0,0 +1,160 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import io +import os +from typing import Any, Dict, List, Union + +import numpy as np +import soundfile as sf +import torch +import torchaudio + +from modelscope.fileio import File +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import InputModel, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.speaker_verification, + module_name=Pipelines.speaker_verification_tdnn) +class SpeakerVerificationTDNNPipeline(Pipeline): + """Speaker Verification Inference Pipeline + use `model` to create a Speaker Verification pipeline. + + Args: + model (SpeakerVerificationPipeline): A model instance, or a model local dir, or a model id in the model hub. + kwargs (dict, `optional`): + Extra kwargs passed into the pipeline's constructor.
+ Example: + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> p = pipeline( + >>> task=Tasks.speaker_verification, model='damo/speech_ecapa-tdnn_sv_en_voxceleb_16k') + >>> print(p([audio_1, audio_2])) + + """ + + def __init__(self, model: InputModel, **kwargs): + """use `model` to create a speaker verification pipeline for prediction + Args: + model (str): a valid offical model id + """ + super().__init__(model=model, **kwargs) + self.model_config = self.model.model_config + self.config = self.model.other_config + self.thr = self.config['yesOrno_thr'] + self.save_dict = {} + + def __call__(self, + in_audios: Union[np.ndarray, list], + save_dir: str = None, + output_emb: bool = False, + thr: float = None): + if thr is not None: + self.thr = thr + if self.thr < -1 or self.thr > 1: + raise ValueError( + 'modelscope error: the thr value should be in [-1, 1], but found to be %f.' + % self.thr) + wavs = self.preprocess(in_audios) + embs = self.forward(wavs) + outputs = self.postprocess(embs, in_audios, save_dir) + if output_emb: + self.save_dict['outputs'] = outputs + self.save_dict['embs'] = embs.numpy() + return self.save_dict + else: + return outputs + + def forward(self, inputs: list): + embs = [] + for x in inputs: + embs.append(self.model(x)) + embs = torch.cat(embs) + return embs + + def postprocess(self, + inputs: torch.Tensor, + in_audios: Union[np.ndarray, list], + save_dir=None): + if isinstance(in_audios[0], str) and save_dir is not None: + # save the embeddings + os.makedirs(save_dir, exist_ok=True) + for i, p in enumerate(in_audios): + save_path = os.path.join( + save_dir, '%s.npy' % + (os.path.basename(p).rsplit('.', 1)[0])) + np.save(save_path, inputs[i].numpy()) + + if len(inputs) == 2: + # compute the score + score = self.compute_cos_similarity(inputs[0], inputs[1]) + score = round(score, 5) + if score >= self.thr: + ans = 'yes' + else: + ans = 'no' + output = {OutputKeys.SCORE: score, OutputKeys.TEXT: ans} + else: + output = {OutputKeys.TEXT: 'No similarity score output'} + + return output + + def preprocess(self, inputs: Union[np.ndarray, list]): + output = [] + for i in range(len(inputs)): + if isinstance(inputs[i], str): + file_bytes = File.read(inputs[i]) + data, fs = sf.read(io.BytesIO(file_bytes), dtype='float32') + if len(data.shape) == 2: + data = data[:, 0] + data = torch.from_numpy(data).unsqueeze(0) + if fs != self.model_config['sample_rate']: + logger.warning( + 'The sample rate of audio is not %d, resample it.' + % self.model_config['sample_rate']) + data, fs = torchaudio.sox_effects.apply_effects_tensor( + data, + fs, + effects=[[ + 'rate', + str(self.model_config['sample_rate']) + ]]) + data = data.squeeze(0) + elif isinstance(inputs[i], np.ndarray): + assert len( + inputs[i].shape + ) == 1, 'modelscope error: Input array should be [N, T]' + data = inputs[i] + if data.dtype in ['int16', 'int32', 'int64']: + data = (data / (1 << 15)).astype('float32') + else: + data = data.astype('float32') + data = torch.from_numpy(data) + else: + raise ValueError( + 'modelscope error: The input type is restricted to audio address and nump array.' 
+ ) + output.append(data) + return output + + def compute_cos_similarity(self, emb1: Union[np.ndarray, torch.Tensor], + emb2: Union[np.ndarray, torch.Tensor]) -> float: + if isinstance(emb1, np.ndarray): + emb1 = torch.from_numpy(emb1) + if isinstance(emb2, np.ndarray): + emb2 = torch.from_numpy(emb2) + if len(emb1.shape): + emb1 = emb1.unsqueeze(0) + if len(emb2.shape): + emb2 = emb2.unsqueeze(0) + assert len(emb1.shape) == 2 and len(emb2.shape) == 2 + cos = torch.nn.CosineSimilarity(dim=1, eps=1e-6) + cosine = cos(emb1, emb2) + return cosine.item() diff --git a/modelscope/pipelines/audio/text_to_speech_pipeline.py b/modelscope/pipelines/audio/text_to_speech_pipeline.py index 4cfa9379e..17ce054f3 100644 --- a/modelscope/pipelines/audio/text_to_speech_pipeline.py +++ b/modelscope/pipelines/audio/text_to_speech_pipeline.py @@ -1,16 +1,15 @@ # Copyright (c) Alibaba, Inc. and its affiliates. -from typing import Any, Dict, List +from typing import Any, Dict import numpy as np from modelscope.metainfo import Pipelines -from modelscope.models import Model from modelscope.models.audio.tts import SambertHifigan from modelscope.outputs import OutputKeys from modelscope.pipelines.base import Input, InputModel, Pipeline from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.constant import Fields, Tasks +from modelscope.utils.constant import Tasks __all__ = ['TextToSpeechSambertHifiganPipeline'] diff --git a/modelscope/pipelines/audio/timestamp_pipeline.py b/modelscope/pipelines/audio/timestamp_pipeline.py deleted file mode 100644 index 98e9eb05f..000000000 --- a/modelscope/pipelines/audio/timestamp_pipeline.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Sequence, Tuple, Union - -import json -import yaml -from funasr.utils import asr_utils - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['TimestampPipeline'] - - -@PIPELINES.register_module( - Tasks.speech_timestamp, module_name=Pipelines.speech_timestamp_inference) -class TimestampPipeline(Pipeline): - """Timestamp Inference Pipeline - Example: - - >>> from modelscope.pipelines import pipeline - >>> from modelscope.utils.constant import Tasks - - >>> pipeline_infer = pipeline( - >>> task=Tasks.speech_timestamp, - >>> model='damo/speech_timestamp_predictor-v1-16k-offline') - - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav' - >>> text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢' - >>> print(pipeline_infer(audio_in, text_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """ - Use `model` and `preprocessor` to create an asr pipeline for prediction - Args: - model ('Model' or 'str'): - The pipeline handles three types of model: - - - A model instance - - A model local dir - - A model id in the model hub - output_dir('str'): - output dir path - batch_size('int'): - the batch size for inference - ngpu('int'): - the number of gpus, 0 indicates CPU mode - split_with_space('bool'): - split the input sentence by space - 
seg_dict_file('str'): - seg dict file - param_dict('dict'): - extra kwargs - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import tp_inference_launch - self.funasr_infer_modelscope = tp_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - timestamp_infer_config=self.cmd['timestamp_infer_config'], - timestamp_model_file=self.cmd['timestamp_model_file'], - timestamp_cmvn_file=self.cmd['timestamp_cmvn_file'], - output_dir=self.cmd['output_dir'], - allow_variable_data_keys=self.cmd['allow_variable_data_keys'], - split_with_space=self.cmd['split_with_space'], - seg_dict_file=self.cmd['seg_dict_file'], - param_dict=self.cmd['param_dict'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - text_in: str, - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - text_in('str'): - - A text str input - - A local text file input endswith .txt or .scp - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type for wav file or datasets file ('wav', 'test', 'dev', 'train') - audio_format('str'): - audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord') - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The timestamp result. 
- """ - self.audio_in = None - self.text_in = None - self.raw_inputs = None - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - - # audio - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in) - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - # text - if text_in.startswith('http'): - self.text_in, _ = generate_text_from_url(text_in) - else: - self.text_in = text_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.forward(self.audio_in, self.text_in, **kwargs) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - for key, value in inputs[0].items(): - if key == 'value': - if len(value) > 0: - rst[OutputKeys.TEXT] = value - elif key != 'key': - rst[key] = value - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate inference command - timestamp_model_file = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_model_file']) - timestamp_infer_config = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_infer_config']) - timestamp_cmvn_file = os.path.join( - model_dir, - model_cfg['model']['model_config']['timestamp_cmvn_file']) - mode = model_cfg['model']['model_config']['mode'] - frontend_conf = None - if os.path.exists(timestamp_infer_config): - config_file = open(timestamp_infer_config, encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - seg_dict_file = None - if 'seg_dict_file' in model_cfg['model']['model_config']: - seg_dict_file = os.path.join( - model_dir, model_cfg['model']['model_config']['seg_dict_file']) - update_local_model(model_cfg['model']['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 0, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'allow_variable_data_keys': False, - 'split_with_space': True, - 
'seg_dict_file': seg_dict_file, - 'timestamp_infer_config': timestamp_infer_config, - 'timestamp_model_file': timestamp_model_file, - 'timestamp_cmvn_file': timestamp_cmvn_file, - 'output_dir': None, - 'param_dict': None, - 'fs': { - 'model_fs': None, - 'audio_fs': None - } - } - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - - user_args_dict = [ - 'output_dir', - 'batch_size', - 'mode', - 'ngpu', - 'param_dict', - 'num_workers', - 'log_level', - 'split_with_space', - 'seg_dict_file', - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, audio_in: Dict[str, Any], text_in: Dict[str, Any], - **kwargs) -> Dict[str, Any]: - """Decoding - """ - logger.info('Timestamp Processing ...') - # generate inputs - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [(self.audio_in, 'speech', 'bytes')] - data_cmd.append((text_in, 'text', 'text')) - elif isinstance(self.audio_in, str): - data_cmd = [(self.audio_in, 'speech', 'sound')] - data_cmd.append((text_in, 'text', 'text')) - elif self.raw_inputs is not None: - data_cmd = None - - if self.raw_inputs is None and data_cmd is None: - raise ValueError('please check audio_in') - - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - tp_result = self.run_inference(self.cmd, **kwargs) - - return tp_result - - def run_inference(self, cmd, **kwargs): - tp_result = [] - if self.framework == Frameworks.torch: - tp_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - fs=cmd['fs'], - param_dict=cmd['param_dict'], - **kwargs) - else: - raise ValueError('model type is mismatching') - - return tp_result diff --git a/modelscope/pipelines/audio/voice_activity_detection_pipeline.py b/modelscope/pipelines/audio/voice_activity_detection_pipeline.py deleted file mode 100644 index 3e00454a9..000000000 --- a/modelscope/pipelines/audio/voice_activity_detection_pipeline.py +++ /dev/null @@ -1,255 +0,0 @@ -# Copyright (c) Alibaba, Inc. and its affiliates. -import os -from typing import Any, Dict, List, Sequence, Tuple, Union - -import json -import yaml -from funasr.utils import asr_utils - -from modelscope.metainfo import Pipelines -from modelscope.models import Model -from modelscope.outputs import OutputKeys -from modelscope.pipelines.base import Pipeline -from modelscope.pipelines.builder import PIPELINES -from modelscope.utils.audio.audio_utils import (generate_scp_from_url, - update_local_model) -from modelscope.utils.constant import Frameworks, ModelFile, Tasks -from modelscope.utils.logger import get_logger - -logger = get_logger() - -__all__ = ['VoiceActivityDetectionPipeline'] - - -@PIPELINES.register_module( - Tasks.voice_activity_detection, module_name=Pipelines.vad_inference) -class VoiceActivityDetectionPipeline(Pipeline): - """Voice Activity Detection Inference Pipeline - use `model` to create a Voice Activity Detection pipeline. - - Args: - model: A model instance, or a model local dir, or a model id in the model hub. - kwargs (dict, `optional`): - Extra kwargs passed into the preprocessor's constructor. 
- - Example: - >>> from modelscope.pipelines import pipeline - >>> pipeline_vad = pipeline( - >>> task=Tasks.voice_activity_detection, model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch') - >>> audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.pcm' - >>> print(pipeline_vad(audio_in)) - - """ - - def __init__(self, - model: Union[Model, str] = None, - ngpu: int = 1, - **kwargs): - """use `model` to create an vad pipeline for prediction - """ - super().__init__(model=model, **kwargs) - config_path = os.path.join(model, ModelFile.CONFIGURATION) - self.cmd = self.get_cmd(config_path, kwargs, model) - - from funasr.bin import vad_inference_launch - self.funasr_infer_modelscope = vad_inference_launch.inference_launch( - mode=self.cmd['mode'], - batch_size=self.cmd['batch_size'], - dtype=self.cmd['dtype'], - ngpu=ngpu, - seed=self.cmd['seed'], - num_workers=self.cmd['num_workers'], - log_level=self.cmd['log_level'], - key_file=self.cmd['key_file'], - vad_infer_config=self.cmd['vad_infer_config'], - vad_model_file=self.cmd['vad_model_file'], - vad_cmvn_file=self.cmd['vad_cmvn_file'], - **kwargs, - ) - - def __call__(self, - audio_in: Union[str, bytes], - audio_fs: int = None, - recog_type: str = None, - audio_format: str = None, - output_dir: str = None, - param_dict: dict = None, - **kwargs) -> Dict[str, Any]: - """ - Decoding the input audios - Args: - audio_in('str' or 'bytes'): - - A string containing a local path to a wav file - - A string containing a local path to a scp - - A string containing a wav url - - A bytes input - audio_fs('int'): - frequency of sample - recog_type('str'): - recog type for wav file or datasets file ('wav', 'test', 'dev', 'train') - audio_format('str'): - audio format ('pcm', 'scp', 'kaldi_ark', 'tfrecord') - output_dir('str'): - output dir - param_dict('dict'): - extra kwargs - Return: - A dictionary of result or a list of dictionary of result. - - The dictionary contain the following keys: - - **text** ('str') --The vad result. 
- """ - self.audio_in = None - self.raw_inputs = None - self.recog_type = recog_type - self.audio_format = audio_format - self.audio_fs = None - checking_audio_fs = None - if output_dir is not None: - self.cmd['output_dir'] = output_dir - if param_dict is not None: - self.cmd['param_dict'] = param_dict - if isinstance(audio_in, str): - # for funasr code, generate wav.scp from url or local path - self.audio_in, self.raw_inputs = generate_scp_from_url(audio_in) - elif isinstance(audio_in, bytes): - self.audio_in = audio_in - self.raw_inputs = None - else: - import numpy - import torch - if isinstance(audio_in, torch.Tensor): - self.audio_in = None - self.raw_inputs = audio_in - elif isinstance(audio_in, numpy.ndarray): - self.audio_in = None - self.raw_inputs = audio_in - - # set the sample_rate of audio_in if checking_audio_fs is valid - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - - if recog_type is None or audio_format is None: - self.recog_type, self.audio_format, self.audio_in = asr_utils.type_checking( - audio_in=self.audio_in, - recog_type=recog_type, - audio_format=audio_format) - - if hasattr(asr_utils, - 'sample_rate_checking') and self.audio_in is not None: - checking_audio_fs = asr_utils.sample_rate_checking( - self.audio_in, self.audio_format) - if checking_audio_fs is not None: - self.audio_fs = checking_audio_fs - if audio_fs is not None: - self.cmd['fs']['audio_fs'] = audio_fs - else: - self.cmd['fs']['audio_fs'] = self.audio_fs - - output = self.forward(self.audio_in, **kwargs) - result = self.postprocess(output) - return result - - def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: - """Postprocessing - """ - rst = {} - for i in range(len(inputs)): - if i == 0: - text = inputs[0]['value'] - if len(text) > 0: - rst[OutputKeys.TEXT] = text - else: - rst[inputs[i]['key']] = inputs[i]['value'] - return rst - - def get_cmd(self, config_path, extra_args, model_path) -> Dict[str, Any]: - model_cfg = json.loads(open(config_path).read()) - model_dir = os.path.dirname(config_path) - # generate inference command - vad_model_path = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_model_name']) - vad_model_config = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_model_config']) - vad_cmvn_file = os.path.join( - model_dir, model_cfg['model']['model_config']['vad_mvn_file']) - mode = model_cfg['model']['model_config']['mode'] - frontend_conf = None - if os.path.exists(vad_model_config): - config_file = open(vad_model_config, encoding='utf-8') - root = yaml.full_load(config_file) - config_file.close() - if 'frontend_conf' in root: - frontend_conf = root['frontend_conf'] - update_local_model(model_cfg['model']['model_config'], model_path, - extra_args) - - cmd = { - 'mode': mode, - 'batch_size': 1, - 'dtype': 'float32', - 'ngpu': 1, # 0: only CPU, ngpu>=1: gpu number if cuda is available - 'seed': 0, - 'num_workers': 0, - 'log_level': 'ERROR', - 'key_file': None, - 'vad_infer_config': vad_model_config, - 'vad_model_file': vad_model_path, - 'vad_cmvn_file': vad_cmvn_file, - 'output_dir': None, - 'param_dict': None, - 'fs': { - 'model_fs': None, - 'audio_fs': None - } - } - if frontend_conf is not None and 'fs' in frontend_conf: - cmd['fs']['model_fs'] = frontend_conf['fs'] - - user_args_dict = [ - 'output_dir', 'batch_size', 'mode', 'ngpu', 'param_dict', - 'num_workers', 'fs' - ] - - for user_args in user_args_dict: - if user_args in extra_args: - if extra_args.get(user_args) is not None: - cmd[user_args] = 
extra_args[user_args] - del extra_args[user_args] - - return cmd - - def forward(self, audio_in: Dict[str, Any], **kwargs) -> Dict[str, Any]: - """Decoding - """ - logger.info('VAD Processing ...') - # generate inputs - data_cmd: Sequence[Tuple[str, str, str]] - if isinstance(self.audio_in, bytes): - data_cmd = [self.audio_in, 'speech', 'bytes'] - elif isinstance(self.audio_in, str): - data_cmd = [self.audio_in, 'speech', 'sound'] - elif self.raw_inputs is not None: - data_cmd = None - self.cmd['name_and_type'] = data_cmd - self.cmd['raw_inputs'] = self.raw_inputs - self.cmd['audio_in'] = self.audio_in - - vad_result = self.run_inference(self.cmd, **kwargs) - - return vad_result - - def run_inference(self, cmd, **kwargs): - vad_result = [] - if self.framework == Frameworks.torch: - vad_result = self.funasr_infer_modelscope( - data_path_and_name_and_type=cmd['name_and_type'], - raw_inputs=cmd['raw_inputs'], - output_dir_v2=cmd['output_dir'], - fs=cmd['fs'], - param_dict=cmd['param_dict'], - **kwargs) - else: - raise ValueError('model type is mismatching') - - return vad_result diff --git a/modelscope/pipelines/base.py b/modelscope/pipelines/base.py index 4869e5c70..91c5b5543 100644 --- a/modelscope/pipelines/base.py +++ b/modelscope/pipelines/base.py @@ -44,7 +44,7 @@ class Pipeline(ABC): """Pipeline base. """ - def initiate_single_model(self, model): + def initiate_single_model(self, model, **kwargs): if isinstance(model, str): logger.info(f'initiate model from {model}') if isinstance(model, str) and is_official_hub_path(model): @@ -55,7 +55,8 @@ def initiate_single_model(self, model): device=self.device_name, model_prefetched=True, invoked_by=Invoke.PIPELINE, - device_map=self.device_map) if is_model(model) else model + device_map=self.device_map, + **kwargs) if is_model(model) else model else: return model @@ -96,7 +97,7 @@ def __init__(self, self.device_name = device if not isinstance(model, List): - self.model = self.initiate_single_model(model) + self.model = self.initiate_single_model(model, **kwargs) self.models = [self.model] else: self.model = None @@ -204,6 +205,13 @@ def __call__(self, input: Union[Input, List[Input]], *args, kwargs['preprocess_params'] = preprocess_params kwargs['forward_params'] = forward_params kwargs['postprocess_params'] = postprocess_params + + # for LLMPipeline, we shall support treating list of roles as a + # one single 'messages' input + if 'LLMPipeline' in type(self).__name__ and isinstance(input, list): + input = {'messages': input} + kwargs['is_message'] = True + if isinstance(input, list): if batch_size is None: output = [] @@ -396,7 +404,6 @@ def forward(self, inputs: Dict[str, Any], assert not self.has_multiple_models, 'default implementation does not support multiple models in a pipeline.' return self.model(inputs, **forward_params) - @abstractmethod def postprocess(self, inputs: Dict[str, Any], **post_params) -> Dict[str, Any]: """ If current pipeline support model reuse, common postprocess @@ -481,7 +488,10 @@ def __init__(self, def __del__(self): if hasattr(self, 'model_pool') and self.model_pool is not None: - self.model_pool.terminate() + try: + self.model_pool.terminate() + except AttributeError: + pass def __getstate__(self): self_dict = self.__dict__.copy() diff --git a/modelscope/pipelines/builder.py b/modelscope/pipelines/builder.py index f44f73811..665318073 100644 --- a/modelscope/pipelines/builder.py +++ b/modelscope/pipelines/builder.py @@ -1,13 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
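# A minimal sketch of the list-of-messages handling added to Pipeline.__call__ earlier in this
# diff: when an LLMPipeline receives a plain list, it is wrapped as {'messages': <list>} and
# is_message is set. The model id is a placeholder and the role/content message layout is the
# common convention assumed here, not something specified by this diff.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

chat = pipeline(task=Tasks.chat, model='<chat-llm-model-id>')  # placeholder model id
messages = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {'role': 'user', 'content': 'Hello!'},
]
print(chat(messages))  # equivalent to chat({'messages': messages})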
import os -from typing import List, Optional, Union +from typing import Any, Dict, List, Optional, Union from modelscope.hub.snapshot_download import snapshot_download from modelscope.metainfo import DEFAULT_MODEL_FOR_PIPELINE from modelscope.models.base import Model from modelscope.utils.config import ConfigDict, check_config -from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, +from modelscope.utils.constant import (DEFAULT_MODEL_REVISION, Invoke, Tasks, ThirdParty) from modelscope.utils.hub import read_config from modelscope.utils.plugins import (register_modelhub_repo, @@ -108,18 +108,29 @@ def pipeline(task: str = None, """ if task is None and pipeline_name is None: raise ValueError('task or pipeline_name is required') - + prefer_llm_pipeline = kwargs.get('external_engine_for_llm') + if task is not None and task.lower() in [ + Tasks.text_generation, Tasks.chat + ]: + # if not specified, prefer llm pipeline for aforementioned tasks + if prefer_llm_pipeline is None: + prefer_llm_pipeline = True + # for llm pipeline, if llm_framework is not specified, default to swift instead + # TODO: port the swift infer based on transformer into ModelScope + if prefer_llm_pipeline and kwargs.get('llm_framework') is None: + kwargs['llm_framework'] = 'swift' third_party = kwargs.get(ThirdParty.KEY) if third_party is not None: kwargs.pop(ThirdParty.KEY) - model = normalize_model_input( - model, - model_revision, - third_party=third_party, - ignore_file_pattern=ignore_file_pattern) - if pipeline_name is None and kwargs.get('llm_first'): - pipeline_name = llm_first_checker(model, model_revision) - kwargs.pop('llm_first') + if pipeline_name is None and prefer_llm_pipeline: + pipeline_name = external_engine_for_llm_checker( + model, model_revision, kwargs) + else: + model = normalize_model_input( + model, + model_revision, + third_party=third_party, + ignore_file_pattern=ignore_file_pattern) pipeline_props = {'type': pipeline_name} if pipeline_name is None: # get default pipeline for this task @@ -131,10 +142,16 @@ def pipeline(task: str = None, model, revision=model_revision) if isinstance( model, str) else read_config( model[0], revision=model_revision) - check_config(cfg) register_plugins_repo(cfg.safe_get('plugins')) register_modelhub_repo(model, cfg.get('allow_remote', False)) - pipeline_props = cfg.pipeline + pipeline_name = external_engine_for_llm_checker( + model, model_revision, + kwargs) if prefer_llm_pipeline else None + if pipeline_name is not None: + pipeline_props = {'type': pipeline_name} + else: + check_config(cfg) + pipeline_props = cfg.pipeline elif model is not None: # get pipeline info from Model object first_model = model[0] if isinstance(model, list) else model @@ -153,6 +170,7 @@ def pipeline(task: str = None, pipeline_props['device'] = device cfg = ConfigDict(pipeline_props) + clear_llm_info(kwargs) if kwargs: cfg.update(kwargs) @@ -201,15 +219,27 @@ def get_default_pipeline_info(task): return pipeline_name, default_model -def llm_first_checker(model: Union[str, List[str], Model, List[Model]], - revision: Optional[str]) -> Optional[str]: - from .nlp.llm_pipeline import ModelTypeHelper, LLM_FORMAT_MAP +def external_engine_for_llm_checker(model: Union[str, List[str], Model, + List[Model]], + revision: Optional[str], + kwargs: Dict[str, Any]) -> Optional[str]: + from .nlp.llm_pipeline import ModelTypeHelper, LLMAdapterRegistry if isinstance(model, list): model = model[0] if not isinstance(model, str): model = model.model_dir + + if kwargs.get('llm_framework') == 
'swift': + return 'llm' model_type = ModelTypeHelper.get( - model, revision, with_adapter=True, split='-') - if model_type in LLM_FORMAT_MAP: + model, revision, with_adapter=True, split='-', use_cache=True) + if LLMAdapterRegistry.contains(model_type): return 'llm' + + +def clear_llm_info(kwargs: Dict): + from modelscope.utils.model_type_helper import ModelTypeHelper + + kwargs.pop('external_engine_for_llm', None) + ModelTypeHelper.clear_cache() diff --git a/modelscope/pipelines/cv/__init__.py b/modelscope/pipelines/cv/__init__.py index 6fcd77eac..530c86a97 100644 --- a/modelscope/pipelines/cv/__init__.py +++ b/modelscope/pipelines/cv/__init__.py @@ -41,6 +41,7 @@ from .image_super_resolution_pasd_pipeline import ImageSuperResolutionPASDPipeline from .image_to_image_generate_pipeline import Image2ImageGenerationPipeline from .image_to_image_translation_pipeline import Image2ImageTranslationPipeline + from .image_inpainting_pipeline import ImageInpaintingPipeline from .image_paintbyexample_pipeline import ImagePaintbyexamplePipeline from .product_retrieval_embedding_pipeline import ProductRetrievalEmbeddingPipeline @@ -84,6 +85,7 @@ from .video_object_segmentation_pipeline import VideoObjectSegmentationPipeline from .video_deinterlace_pipeline import VideoDeinterlacePipeline from .image_matching_pipeline import ImageMatchingPipeline + from .image_matching_fast_pipeline import ImageMatchingFastPipeline from .video_stabilization_pipeline import VideoStabilizationPipeline from .video_super_resolution_pipeline import VideoSuperResolutionPipeline from .pointcloud_sceneflow_estimation_pipeline import PointCloudSceneFlowEstimationPipeline @@ -107,6 +109,7 @@ from .image_human_parsing_pipeline import ImageHumanParsingPipeline from .nerf_recon_acc_pipeline import NeRFReconAccPipeline from .nerf_recon_4k_pipeline import NeRFRecon4KPipeline + from .image_to_3d_pipeline import Image23DPipeline from .surface_recon_common_pipeline import SurfaceReconCommonPipeline from .controllable_image_generation_pipeline import ControllableImageGenerationPipeline from .image_bts_depth_estimation_pipeline import ImageBTSDepthEstimationPipeline @@ -115,6 +118,13 @@ from .text_to_360panorama_image_pipeline import Text2360PanoramaImagePipeline from .human3d_render_pipeline import Human3DRenderPipeline from .human3d_animation_pipeline import Human3DAnimationPipeline + from .image_local_feature_matching_pipeline import ImageLocalFeatureMatchingPipeline + from .rife_video_frame_interpolation_pipeline import RIFEVideoFrameInterpolationPipeline + from .anydoor_pipeline import AnydoorPipeline + from .image_depth_estimation_marigold_pipeline import ImageDepthEstimationMarigoldPipeline + from .self_supervised_depth_completion_pipeline import SelfSupervisedDepthCompletionPipeline + from .human_normal_estimation_pipeline import HumanNormalEstimationPipeline + else: _import_structure = { 'action_recognition_pipeline': ['ActionRecognitionPipeline'], @@ -163,6 +173,7 @@ ['ProductRetrievalEmbeddingPipeline'], 'live_category_pipeline': ['LiveCategoryPipeline'], 'image_to_image_generate_pipeline': ['Image2ImageGenerationPipeline'], + 'image_to_3d_pipeline': ['Image23DPipeline'], 'image_inpainting_pipeline': ['ImageInpaintingPipeline'], 'image_paintbyexample_pipeline': ['ImagePaintbyexamplePipeline'], 'ocr_detection_pipeline': ['OCRDetectionPipeline'], @@ -228,6 +239,7 @@ ], 'video_deinterlace_pipeline': ['VideoDeinterlacePipeline'], 'image_matching_pipeline': ['ImageMatchingPipeline'], + 'image_matching_fast_pipeline': 
['ImageMatchingFastPipeline'], 'video_stabilization_pipeline': ['VideoStabilizationPipeline'], 'video_super_resolution_pipeline': ['VideoSuperResolutionPipeline'], 'pointcloud_sceneflow_estimation_pipeline': [ @@ -269,6 +281,7 @@ 'image_human_parsing_pipeline': ['ImageHumanParsingPipeline'], 'nerf_recon_acc_pipeline': ['NeRFReconAccPipeline'], 'nerf_recon_4k_pipeline': ['NeRFRecon4KPipeline'], + 'nerf_recon_img_to_mv_pipeline': ['NeRFReconImgToMVPipeline'], 'surface_recon_common_pipeline': ['SurfaceReconCommonPipeline'], 'controllable_image_generation_pipeline': [ 'ControllableImageGenerationPipeline' @@ -287,6 +300,20 @@ ], 'human3d_render_pipeline': ['Human3DRenderPipeline'], 'human3d_animation_pipeline': ['Human3DAnimationPipeline'], + 'image_local_feature_matching_pipeline': [ + 'ImageLocalFeatureMatchingPipeline' + ], + 'rife_video_frame_interpolation_pipeline': [ + 'RIFEVideoFrameInterpolationPipeline' + ], + 'anydoor_pipeline': ['AnydoorPipeline'], + 'image_depth_estimation_marigold_pipeline': [ + 'ImageDepthEstimationMarigoldPipeline' + ], + 'self_supervised_depth_completion_pipeline': [ + 'SelfSupervisedDepthCompletionPipeline' + ], + 'human_normal_estimation_pipeline': ['HumanNormalEstimationPipeline'], } import sys diff --git a/modelscope/pipelines/cv/anydoor_pipeline.py b/modelscope/pipelines/cv/anydoor_pipeline.py new file mode 100644 index 000000000..397cd21d7 --- /dev/null +++ b/modelscope/pipelines/cv/anydoor_pipeline.py @@ -0,0 +1,290 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. + +import os +from typing import Any, Dict + +import cv2 +import einops +import numpy as np +import requests +import torch +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.anydoor.cldm.ddim_hacked import DDIMSampler +from modelscope.models.cv.anydoor.datasets.data_utils import ( + box2squre, box_in_box, expand_bbox, expand_image_mask, get_bbox_from_mask, + pad_to_square, sobel) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors.image import load_image +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_to_image_generation, module_name=Pipelines.anydoor) +class AnydoorPipeline(Pipeline): + r""" AnyDoor Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from PIL import Image + + >>> ref_image = 'data/test/images/image_anydoor_fg.png' + >>> ref_mask = 'data/test/images/image_anydoor_fg_mask.png' + >>> bg_image = 'data/test/images/image_anydoor_bg.png' + >>> bg_mask = 'data/test/images/image_anydoor_bg_mask.png' + + >>> anydoor_pipeline = pipeline(Tasks.image_to_image_generation, model='damo/AnyDoor') + >>> out = anydoor_pipeline((ref_image, ref_mask, bg_image, bg_mask)) + >>> assert isinstance(out['output_img'], Image.Image) + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a action detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + model_ckpt = os.path.join(self.model.model_dir, + self.cfg.model.model_path) + self.model.load_state_dict( + self._get_state_dict(model_ckpt, location='cuda')) + self.ddim_sampler = DDIMSampler(self.model) + + @staticmethod + def _get_state_dict(ckpt_path, location='cpu'): + + def get_state_dict(d): + return d.get('state_dict', d) + + _, extension = os.path.splitext(ckpt_path) + if extension.lower() == '.safetensors': + import safetensors.torch + state_dict = safetensors.torch.load_file( + ckpt_path, device=location) + else: + state_dict = get_state_dict( + torch.load(ckpt_path, map_location=torch.device(location))) + state_dict = get_state_dict(state_dict) + print(f'Loaded state_dict from [{ckpt_path}]') + return state_dict + + def preprocess(self, inputs: Input) -> Dict[str, Any]: + ref_image, ref_mask, tar_image, tar_mask = inputs + ref_image = np.asarray(load_image(ref_image).convert('RGB')) + ref_mask = np.where( + np.asarray(load_image(ref_mask).convert('L')) > 128, 1, + 0).astype(np.uint8) + tar_image = np.asarray(load_image(tar_image).convert('RGB')) + tar_mask = np.where( + np.asarray(load_image(tar_mask).convert('L')) > 128, 1, + 0).astype(np.uint8) + + # ========= Reference =========== + # ref expand + ref_box_yyxx = get_bbox_from_mask(ref_mask) + + # ref filter mask + ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1) + masked_ref_image = ref_image * ref_mask_3 + np.ones_like( + ref_image) * 255 * (1 - ref_mask_3) + + y1, y2, x1, x2 = ref_box_yyxx + masked_ref_image = masked_ref_image[y1:y2, x1:x2, :] + ref_mask = ref_mask[y1:y2, x1:x2] + + ratio = np.random.randint(11, 15) / 10 # 11,13 + masked_ref_image, ref_mask = expand_image_mask( + masked_ref_image, ref_mask, ratio=ratio) + ref_mask_3 = np.stack([ref_mask, ref_mask, ref_mask], -1) + + # to square and resize + masked_ref_image = pad_to_square( + masked_ref_image, pad_value=255, random=False) + masked_ref_image = cv2.resize( + masked_ref_image.astype(np.uint8), (224, 224)).astype(np.uint8) + + ref_mask_3 = pad_to_square(ref_mask_3 * 255, pad_value=0, random=False) + ref_mask_3 = cv2.resize(ref_mask_3.astype(np.uint8), + (224, 224)).astype(np.uint8) + ref_mask = ref_mask_3[:, :, 0] + + # collage aug + masked_ref_image_compose, ref_mask_compose = masked_ref_image, ref_mask + ref_mask_3 = np.stack( + [ref_mask_compose, ref_mask_compose, ref_mask_compose], -1) + ref_image_collage = sobel(masked_ref_image_compose, + ref_mask_compose / 255) + + # ========= Target =========== + tar_box_yyxx = get_bbox_from_mask(tar_mask) + tar_box_yyxx = expand_bbox( + tar_mask, tar_box_yyxx, ratio=[1.1, 1.2]) # 1.1 1.3 + + # crop + tar_box_yyxx_crop = expand_bbox( + tar_image, tar_box_yyxx, ratio=[1.3, 3.0]) + tar_box_yyxx_crop = box2squre(tar_image, tar_box_yyxx_crop) # crop box + y1, y2, x1, x2 = tar_box_yyxx_crop + + cropped_target_image = tar_image[y1:y2, x1:x2, :] + cropped_tar_mask = tar_mask[y1:y2, x1:x2] + + tar_box_yyxx = box_in_box(tar_box_yyxx, tar_box_yyxx_crop) + y1, y2, x1, x2 = tar_box_yyxx + + # collage + ref_image_collage = cv2.resize( + ref_image_collage.astype(np.uint8), (x2 - x1, y2 - y1)) + ref_mask_compose = cv2.resize( + ref_mask_compose.astype(np.uint8), (x2 - x1, y2 - y1)) + ref_mask_compose = (ref_mask_compose > 128).astype(np.uint8) + + collage = cropped_target_image.copy() + collage[y1:y2, x1:x2, :] = ref_image_collage + + collage_mask = cropped_target_image.copy() * 0.0 + collage_mask[y1:y2, x1:x2, :] = 1.0 + collage_mask = np.stack( + [cropped_tar_mask, 
cropped_tar_mask, cropped_tar_mask], -1) + + # the size before pad + H1, W1 = collage.shape[0], collage.shape[1] + + cropped_target_image = pad_to_square( + cropped_target_image, pad_value=0, random=False).astype(np.uint8) + collage = pad_to_square( + collage, pad_value=0, random=False).astype(np.uint8) + collage_mask = pad_to_square( + collage_mask, pad_value=0, random=False).astype(np.uint8) + + # the size after pad + H2, W2 = collage.shape[0], collage.shape[1] + + cropped_target_image = cv2.resize( + cropped_target_image.astype(np.uint8), + (512, 512)).astype(np.float32) + collage = cv2.resize(collage.astype(np.uint8), + (512, 512)).astype(np.float32) + collage_mask = (cv2.resize(collage_mask.astype( + np.uint8), (512, 512)).astype(np.float32) > 0.5).astype(np.float32) + + masked_ref_image = masked_ref_image / 255 + cropped_target_image = cropped_target_image / 127.5 - 1.0 + collage = collage / 127.5 - 1.0 + collage = np.concatenate([collage, collage_mask[:, :, :1]], -1) + + item = dict( + tar_image=tar_image, + ref=masked_ref_image.copy(), + jpg=cropped_target_image.copy(), + hint=collage.copy(), + extra_sizes=np.array([H1, W1, H2, W2]), + tar_box_yyxx_crop=np.array(tar_box_yyxx_crop)) + return item + + def forward(self, + item: Dict[str, Any], + num_samples=1, + strength=1.0, + ddim_steps=30, + scale=3.0) -> Dict[str, Any]: + tar_image = item['tar_image'].cpu().numpy() + ref = item['ref'] + hint = item['hint'] + num_samples = 1 + + control = hint.float().cuda() + control = torch.stack([control for _ in range(num_samples)], dim=0) + control = einops.rearrange(control, 'b h w c -> b c h w').clone() + + clip_input = ref.float().cuda() + clip_input = torch.stack([clip_input for _ in range(num_samples)], + dim=0) + clip_input = einops.rearrange(clip_input, 'b h w c -> b c h w').clone() + + H, W = 512, 512 + + cond = { + 'c_concat': [control], + 'c_crossattn': [self.model.get_learned_conditioning(clip_input)] + } + un_cond = { + 'c_concat': [control], + 'c_crossattn': [ + self.model.get_learned_conditioning( + [torch.zeros((1, 3, 224, 224))] * num_samples) + ] + } + shape = (4, H // 8, W // 8) + + self.model.control_scales = ([strength] * 13) + samples, _ = self.ddim_sampler.sample( + ddim_steps, + num_samples, + shape, + cond, + verbose=False, + eta=0, + unconditional_guidance_scale=scale, + unconditional_conditioning=un_cond) + + x_samples = self.model.decode_first_stage(samples) + x_samples = (einops.rearrange(x_samples, 'b c h w -> b h w c') * 127.5 + + 127.5).cpu().numpy() + + result = x_samples[0][:, :, ::-1] + result = np.clip(result, 0, 255) + + pred = x_samples[0] + pred = np.clip(pred, 0, 255)[1:, :, :] + sizes = item['extra_sizes'].cpu().numpy() + tar_box_yyxx_crop = item['tar_box_yyxx_crop'].cpu().numpy() + return dict( + pred=pred, + tar_image=tar_image, + sizes=sizes, + tar_box_yyxx_crop=tar_box_yyxx_crop) + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + pred = inputs['pred'] + tar_image = inputs['tar_image'] + extra_sizes = inputs['sizes'] + tar_box_yyxx_crop = inputs['tar_box_yyxx_crop'] + + H1, W1, H2, W2 = extra_sizes + y1, y2, x1, x2 = tar_box_yyxx_crop + pred = cv2.resize(pred, (W2, H2)) + m = 3 # maigin_pixel + + if W1 == H1: + tar_image[y1 + m:y2 - m, x1 + m:x2 - m, :] = pred[m:-m, m:-m] + gen_image = torch.from_numpy(tar_image.copy()).permute(2, 0, 1) + gen_image = gen_image.permute(1, 2, 0).numpy() + gen_image = Image.fromarray(gen_image, mode='RGB') + return {OutputKeys.OUTPUT_IMG: gen_image} + + if W1 < W2: + pad1 = int((W2 - W1) / 2) + pad2 = 
W2 - W1 - pad1 + pred = pred[:, pad1:-pad2, :] + else: + pad1 = int((H2 - H1) / 2) + pad2 = H2 - H1 - pad1 + pred = pred[pad1:-pad2, :, :] + + gen_image = tar_image.copy() + gen_image[y1 + m:y2 - m, x1 + m:x2 - m, :] = pred[m:-m, m:-m] + + gen_image = torch.from_numpy(gen_image).permute(2, 0, 1) + gen_image = gen_image.permute(1, 2, 0).numpy() + gen_image = Image.fromarray(gen_image, mode='RGB') + return {OutputKeys.OUTPUT_IMG: gen_image} diff --git a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py index af1e08fe8..c9e5036a1 100644 --- a/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py +++ b/modelscope/pipelines/cv/body_3d_keypoints_pipeline.py @@ -16,7 +16,7 @@ from matplotlib.ticker import MultipleLocator from modelscope.metainfo import Pipelines -from modelscope.models.cv.body_3d_keypoints.cannonical_pose.body_3d_pose import \ +from modelscope.models.cv.body_3d_keypoints.canonical_pose.body_3d_pose import \ KeypointsTypes from modelscope.outputs import OutputKeys from modelscope.pipelines import pipeline diff --git a/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py b/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py new file mode 100644 index 000000000..f734fd97c --- /dev/null +++ b/modelscope/pipelines/cv/dense_optical_flow_estimation_pipeline.py @@ -0,0 +1,147 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.cv.image_utils import InputPadder, flow_to_color +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.dense_optical_flow_estimation, + module_name=Pipelines.dense_optical_flow_estimation) +class DenseOpticalFlowEstimationPipeline(Pipeline): + r""" Dense Optical Flow Estimation Pipeline. 
+ + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline(Tasks.dense_optical_flow_estimation, model='Damo_XR_Lab/cv_raft_dense-optical-flow_things') + >>> estimator([[ + >>> 'modelscope/models/cv/dense_optical_flow_estimation/data/test/images/dense_flow1.png', + >>> 'modelscope/models/cv/dense_optical_flow_estimation/data/test/images/dense_flow2.png' + >>> ]]) + >>> [{'flows': tensor([[[[-1.6319, -1.6348, -1.6363, ..., -1.7191, -1.7136, -1.7085], + >>> [-1.6324, -1.6344, -1.6351, ..., -1.7110, -1.7048, -1.7005], + >>> [-1.6318, -1.6326, -1.6329, ..., -1.7080, -1.7050, -1.7031], + >>> ..., + >>> [-2.0998, -2.1007, -2.0958, ..., -1.4086, -1.4055, -1.3996], + >>> [-2.1043, -2.1031, -2.0988, ..., -1.4075, -1.4049, -1.3991], + >>> [-2.1016, -2.0985, -2.0939, ..., -1.4062, -1.4029, -1.3969]], + >>> + >>> [[ 0.0343, 0.0386, 0.0401, ..., 0.8053, 0.8050, 0.8057], + >>> [ 0.0311, 0.0354, 0.0369, ..., 0.8004, 0.8007, 0.8050], + >>> [ 0.0274, 0.0309, 0.0322, ..., 0.8007, 0.8016, 0.8080], + >>> ..., + >>> [ 0.5685, 0.5785, 0.5740, ..., 0.4003, 0.4153, 0.4365], + >>> [ 0.5994, 0.6000, 0.5899, ..., 0.4057, 0.4218, 0.4447], + >>> [ 0.6137, 0.6076, 0.5920, ..., 0.4147, 0.4299, 0.4538]]]], + >>> device='cuda:0'), 'flows_color': array([[[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> [[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> [[255, 249, 219], + >>> [255, 249, 219], + >>> [255, 249, 219], + >>> ..., + >>> [236, 255, 213], + >>> [236, 255, 213], + >>> [236, 255, 213]], + >>> + >>> ..., + >>> + >>> [[251, 255, 207], + >>> [251, 255, 207], + >>> [251, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [251, 255, 222], + >>> [250, 255, 222]], + >>> + >>> [[250, 255, 207], + >>> [250, 255, 207], + >>> [250, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [250, 255, 222], + >>> [249, 255, 222]], + >>> + >>> [[249, 255, 207], + >>> [249, 255, 207], + >>> [250, 255, 207], + >>> ..., + >>> [251, 255, 222], + >>> [250, 255, 222], + >>> [249, 255, 222]]], dtype=uint8)}] + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image depth estimation pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + + logger.info('dense optical flow estimation model, pipeline init') + + def load_image(self, img_name): + img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) + img = img.transpose(2, 0, 1) + + return img + + def preprocess(self, input: Input) -> Dict[str, Any]: + img1 = self.load_image(input[0]) + img2 = self.load_image(input[1]) + + image1 = torch.from_numpy(img1)[None].cuda().float() + image2 = torch.from_numpy(img2)[None].cuda().float() + + padder = InputPadder(image1.shape) + image1, image2 = padder.pad(image1, image2) + + data = {'image1': image1, 'image2': image2} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + flow_ups = self.model.inference(input) + results = flow_ups[-1] + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + out = self.model.postprocess(inputs) + flows_color = flow_to_color([out[OutputKeys.FLOWS]]) + flows_color = flows_color[:, :, [2, 1, 0]] + outputs = { + OutputKeys.FLOWS: out[OutputKeys.FLOWS], + OutputKeys.FLOWS_COLOR: flows_color + } + + return outputs diff --git a/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py b/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py new file mode 100644 index 000000000..5290af242 --- /dev/null +++ b/modelscope/pipelines/cv/facial_68ldk_detection_pipeline.py @@ -0,0 +1,88 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import argparse +import os +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import torch + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.facial_68ldk_detection import infer +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.facial_68ldk_detection, module_name=Pipelines.facial_68ldk_detection) +class FaceLandmarkDetectionPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a facial 68-landmark detection pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + + parser = argparse.ArgumentParser(description='Evaluation script') + args = parser.parse_args() + args.config_name = 'alignment' + + device_ids = list() + if torch.cuda.is_available(): + device_ids = [0] + else: + device_ids = [-1] + + model_path = os.path.join(model, 'pytorch_model.pkl') + + self.fld = infer.Alignment( + args, model_path, dl_framework='pytorch', device_ids=device_ids) + + logger.info('Face 2d landmark detection model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + print('start preprocess') + + image = LoadImage.convert_to_ndarray(input) + image = cv2.resize(image, (256, 256)) + + data = {'image': image} + + print('finish preprocess') + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + print('start infer') + + image = input['image'] + + if torch.cuda.is_available(): + image_np = image.cpu().numpy() + else: + image_np = image.numpy() + + x1, y1, x2, y2 = 0, 0, 256, 256 + scale = max(x2 - x1, y2 - y1) / 180 + center_w = (x1 + x2) / 2 + center_h = (y1 + y2) / 2 + scale, center_w, center_h = float(scale), float(center_w), float( + center_h) + + results = self.fld.analyze(image_np, scale, center_w, center_h) + + print('finish infer') + + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + outputs = {'landmarks': inputs} + return outputs diff --git a/modelscope/pipelines/cv/human3d_animation_pipeline.py b/modelscope/pipelines/cv/human3d_animation_pipeline.py index d03cd8a3e..4e5ab46db 100644 --- a/modelscope/pipelines/cv/human3d_animation_pipeline.py +++ b/modelscope/pipelines/cv/human3d_animation_pipeline.py @@ -72,7 +72,7 @@ def gen_weights(self, save_dir=None): (case_name, action_name)) exec_path = os.path.join(self.model_dir, 'skinning.py') - cmd = f'blender -b -P {exec_path} -- --input {self.case_dir}' \ + cmd = f'{self.blender} -b -P {exec_path} -- --input {self.case_dir}' \ f' --gltf_path {gltf_path} --action {self.action}' os.system(cmd) return gltf_path @@ -83,9 +83,6 @@ def animate(self, mesh_path, action_dir, action, save_dir=None): mesh = read_obj(mesh_path) tex = cv2.imread(tex_path) vertices = mesh['vertices'] - cent = (vertices.max(axis=0) + vertices.min(axis=0)) / 2 - new_cent = (0, 1.8 / 2, 0) - vertices -= (cent - new_cent) mesh['vertices'] = vertices mesh['texture_map'] = tex write_obj(mesh_path, mesh) @@ -108,6 +105,11 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: else: save_dir = None + if 'blender' in input: + self.blender = input['blender'] + else: + self.blender = 'blender' + if case_id.endswith('.obj'): mesh_path = case_id else: diff --git a/modelscope/pipelines/cv/human3d_render_pipeline.py b/modelscope/pipelines/cv/human3d_render_pipeline.py index 44d0bb21d..cf506d190 100644 --- a/modelscope/pipelines/cv/human3d_render_pipeline.py +++ b/modelscope/pipelines/cv/human3d_render_pipeline.py @@ -68,6 +68,8 @@ def load_3d_model(self, mesh_path): def format_nvdiffrast_format(self, mesh, tex): vert = mesh['vertices'] + cent = (vert.max(axis=0) + vert.min(axis=0)) / 2 + vert -= cent tri = mesh['faces'] tri = tri - 1 if tri.min() == 1 else tri vert_uv = mesh['uvs'] @@ -81,7 +83,7 @@ def format_nvdiffrast_format(self, mesh, tex): tex = torch.from_numpy(tex.astype(np.float32) / 255.0).cuda() return vtx_pos, pos_idx, vtx_uv, uv_idx, tex - def render_scene(self, mesh_path): + def render_scene(self, mesh_path, resolution=512): if not os.path.exists(mesh_path): logger.info('can not found %s, use default one' % 
mesh_path) mesh_path = os.path.join(self.model_dir, '3D-assets', @@ -99,8 +101,8 @@ def render_scene(self, mesh_path): frames_normals = [] for i in tqdm.tqdm(range(frame_length)): proj = projection(x=0.4, n=1.0, f=200.0) - a_rot = np.matmul(rotate_x(-0.1), rotate_y(ang)) - a_mv = np.matmul(translate(0, 0, -2.5), a_rot) + a_rot = np.matmul(rotate_x(0.0), rotate_y(ang)) + a_mv = np.matmul(translate(0, 0, -2.7), a_rot) r_mvp = np.matmul(proj, a_mv).astype(np.float32) pred_img, pred_mask, normal = render( glctx, @@ -110,7 +112,7 @@ def render_scene(self, mesh_path): vtx_uv, uv_idx, tex, - resolution=512, + resolution=resolution, enable_mip=False, max_mip_level=9) color = np.clip( @@ -123,7 +125,7 @@ def render_scene(self, mesh_path): frames_normals.append(normals) ang = ang + step - logger.info('load case %s done' + logger.info('render case %s done' % os.path.basename(os.path.dirname(mesh_path))) return mesh, frames_color, frames_normals @@ -131,6 +133,10 @@ def render_scene(self, mesh_path): def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: dataset_id = input['dataset_id'] case_id = input['case_id'] + if 'resolution' in input: + resolution = input['resolution'] + else: + resolution = 512 if case_id.endswith('.obj'): mesh_path = case_id else: @@ -142,7 +148,7 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: case_dir = os.path.join(data_dir, case_id) mesh_path = os.path.join(case_dir, 'body.obj') - mesh, colors, normals = self.render_scene(mesh_path) + mesh, colors, normals = self.render_scene(mesh_path, resolution) results = { 'mesh': mesh, diff --git a/modelscope/pipelines/cv/human_normal_estimation_pipeline.py b/modelscope/pipelines/cv/human_normal_estimation_pipeline.py new file mode 100644 index 000000000..bd19b18da --- /dev/null +++ b/modelscope/pipelines/cv/human_normal_estimation_pipeline.py @@ -0,0 +1,95 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +import numpy as np +from PIL import Image + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.human_normal_estimation, + module_name=Pipelines.human_normal_estimation) +class HumanNormalEstimationPipeline(Pipeline): + r""" Human Normal Estimation Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline( + >>> Tasks.human_normal_estimation, model='Damo_XR_Lab/cv_human_monocular-normal-estimation') + >>> estimator(f"{model_dir}/tests/image_normal_estimation.jpg") + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image normal estimation pipeline for prediction + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model, **kwargs) + logger.info('normal estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + """ + + Args: + input: string or ndarray or Image.Image + + Returns: + data: dict including inference inputs + """ + if isinstance(input, str): + img = np.array(Image.open(input)) + if isinstance(input, Image.Image): + img = np.array(input) + + img_h, img_w, img_ch = img.shape[0:3] + + if img_ch == 3: + msk = np.full((img_h, img_w, 1), 255, dtype=np.uint8) + img = np.concatenate((img, msk), axis=-1) + + H, W = 1024, 1024 + scale_factor = min(W / img_w, H / img_h) + img = Image.fromarray(img) + img = img.resize( + (int(img_w * scale_factor), int(img_h * scale_factor)), + Image.LANCZOS) + + new_img = Image.new('RGBA', (W, H), color=(0, 0, 0, 0)) + paste_pos_w = (W - img.width) // 2 + paste_pos_h = (H - img.height) // 2 + new_img.paste(img, (paste_pos_w, paste_pos_h)) + + bbox = (paste_pos_w, paste_pos_h, paste_pos_w + img.width, + paste_pos_h + img.height) + img = np.array(new_img) + + data = {'img': img[:, :, 0:3], 'msk': img[:, :, -1], 'bbox': bbox} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + normals = results[OutputKeys.NORMALS] + + normals_vis = (((normals + 1) * 0.5) * 255).astype(np.uint8) + normals_vis = normals_vis[..., [2, 1, 0]] + outputs = { + OutputKeys.NORMALS: normals, + OutputKeys.NORMALS_COLOR: normals_vis + } + return outputs diff --git a/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py b/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py new file mode 100644 index 000000000..e5cdd7e7c --- /dev/null +++ b/modelscope/pipelines/cv/image_depth_estimation_marigold_pipeline.py @@ -0,0 +1,409 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
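Before the next file, a small sketch of the geometry used by `HumanNormalEstimationPipeline.preprocess` above: the input is fitted into a fixed 1024x1024 canvas and the paste bbox is kept so the prediction can later be cropped back. The input size below is invented for illustration.

```python
# Illustrative numbers only (not from the patch): fit a 640x480 image into the
# 1024x1024 canvas used by HumanNormalEstimationPipeline.preprocess and keep
# the paste bbox for mapping the prediction back.
img_w, img_h = 640, 480
W, H = 1024, 1024
scale = min(W / img_w, H / img_h)                      # 1.6
new_w, new_h = int(img_w * scale), int(img_h * scale)  # 1024, 768
paste_w, paste_h = (W - new_w) // 2, (H - new_h) // 2  # 0, 128
bbox = (paste_w, paste_h, paste_w + new_w, paste_h + new_h)
print(bbox)                                            # (0, 128, 1024, 896)
```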
+ +import os +from typing import Any, Dict, Union + +import numpy as np +import torch +from diffusers import (AutoencoderKL, DDIMScheduler, DiffusionPipeline, + UNet2DConditionModel) +from PIL import Image +from torch.utils.data import DataLoader, TensorDataset +from tqdm.auto import tqdm +from transformers import CLIPTextModel, CLIPTokenizer + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.image_depth_estimation_marigold import ( + MarigoldDepthOutput, chw2hwc, colorize_depth_maps, ensemble_depths, + find_batch_size, inter_distances, resize_max_res) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_depth_estimation, + module_name=Pipelines.image_depth_estimation_marigold) +class ImageDepthEstimationMarigoldPipeline(Pipeline): + + def __init__(self, model=str, **kwargs): + r""" + use `model` to create a image depth estimation pipeline for prediction + Args: + >>> model: modelscope model_id "Damo_XR_Lab/cv_marigold_monocular-depth-estimation" + + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + >>> + >>> output_image_path = './result.png' + >>> img = './test.jpg' + >>> + >>> pipe = pipeline( + >>> Tasks.image_depth_estimation, + >>> model='Damo_XR_Lab/cv_marigold_monocular-depth-estimation') + >>> + >>> depth_vis = pipe(input)[OutputKeys.DEPTHS_COLOR] + >>> depth_vis.save(output_image_path) + >>> print('pipeline: the output image path is {}'.format(output_image_path)) + + """ + super().__init__(model=model, **kwargs) + + self._device = getattr( + kwargs, 'device', + torch.device('cuda' if torch.cuda.is_available() else 'cpu')) + self._dtype = torch.float16 + logger.info('load depth estimation marigold pipeline done') + + self.checkpoint_path = os.path.join(model, 'Marigold_v1_merged_2') + self.pipeline = _MarigoldPipeline.from_pretrained( + self.checkpoint_path, torch_dtype=self._dtype) + self.pipeline.to(self._device) + + def preprocess(self, input: Input) -> Dict[str, Any]: + # print('pipeline preprocess') + # TODO: input type: Image + return input + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + self.input_image = Image.open(input) + # print('load', input, self.input_image.size) + + results = self.pipeline(self.input_image) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + depths: np.ndarray = inputs.depth_np + depths_color: Image.Image = inputs.depth_colored + outputs = { + OutputKeys.DEPTHS: depths, + OutputKeys.DEPTHS_COLOR: depths_color + } + return outputs + + +class _MarigoldPipeline(DiffusionPipeline): + """ + Pipeline for monocular depth estimation using Marigold: https://marigoldmonodepth.github.io. + + This model inherits from [`DiffusionPipeline`]. + Check the superclass documentation for the generic methods the + library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.) + + Args: + unet (`UNet2DConditionModel`): + Conditional U-Net to denoise the depth latent, conditioned on image latent. + vae (`AutoencoderKL`): + Variational Auto-Encoder (VAE) Model to encode and decode images and depth maps + to and from latent representations. 
+ scheduler (`DDIMScheduler`): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. + text_encoder (`CLIPTextModel`): + Text-encoder, for empty text embedding. + tokenizer (`CLIPTokenizer`): + CLIP tokenizer. + """ + rgb_latent_scale_factor = 0.18215 + depth_latent_scale_factor = 0.18215 + + def __init__( + self, + unet: UNet2DConditionModel, + vae: AutoencoderKL, + scheduler: DDIMScheduler, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + ): + super().__init__() + + self.register_modules( + unet=unet, + vae=vae, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + ) + + self.empty_text_embed = None + + @torch.no_grad() + def __call__( + self, + input_image: Image, + denoising_steps: int = 10, + ensemble_size: int = 10, + processing_res: int = 768, + match_input_res: bool = True, + batch_size: int = 0, + color_map: str = 'Spectral', + show_progress_bar: bool = True, + ensemble_kwargs: Dict = None, + ) -> MarigoldDepthOutput: + r""" + Function invoked when calling the pipeline. + + Args: + input_image (`Image`): + Input RGB (or gray-scale) image. + processing_res (`int`, *optional*, defaults to `768`): + Maximum resolution of processing. + If set to 0: will not resize at all. + match_input_res (`bool`, *optional*, defaults to `True`): + Resize depth prediction to match input resolution. + Only valid if `limit_input_res` is not None. + denoising_steps (`int`, *optional*, defaults to `10`): + Number of diffusion denoising steps (DDIM) during inference. + ensemble_size (`int`, *optional*, defaults to `10`): + Number of predictions to be ensembled. + batch_size (`int`, *optional*, defaults to `0`): + Inference batch size, no bigger than `num_ensemble`. + If set to 0, the script will automatically decide the proper batch size. + show_progress_bar (`bool`, *optional*, defaults to `True`): + Display a progress bar of diffusion denoising. + color_map (`str`, *optional*, defaults to `"Spectral"`): + Colormap used to colorize the depth map. + ensemble_kwargs (`dict`, *optional*, defaults to `None`): + Arguments for detailed ensembling settings. + Returns: + `MarigoldDepthOutput`: Output class for Marigold monocular depth prediction pipeline, including: + - **depth_np** (`np.ndarray`) Predicted depth map, with depth values in the range of [0, 1] + - **depth_colored** (`PIL.Image.Image`) Colorized depth map, with the shape of [3, H, W] + and values in [0, 1] + - **uncertainty** (`None` or `np.ndarray`) Uncalibrated uncertainty(MAD, median absolute deviation) + coming from ensembling. 
None if `ensemble_size = 1` + """ + + device = self.device + input_size = input_image.size + + if not match_input_res: + assert (processing_res is not None + ), 'Value error: `resize_output_back` is only valid with ' + assert processing_res >= 0 + assert denoising_steps >= 1 + assert ensemble_size >= 1 + + # ----------------- Image Preprocess ----------------- + # Resize image + if processing_res > 0: + input_image = resize_max_res( + input_image, max_edge_resolution=processing_res) + # Convert the image to RGB, to 1.remove the alpha channel 2.convert B&W to 3-channel + input_image = input_image.convert('RGB') + image = np.asarray(input_image) + + # Normalize rgb values + rgb = np.transpose(image, (2, 0, 1)) # [H, W, rgb] -> [rgb, H, W] + rgb_norm = rgb / 255.0 + rgb_norm = torch.from_numpy(rgb_norm).to(self.dtype) + rgb_norm = rgb_norm.to(device) + assert rgb_norm.min() >= 0.0 and rgb_norm.max() <= 1.0 + + # ----------------- Predicting depth ----------------- + # Batch repeated input image + duplicated_rgb = torch.stack([rgb_norm] * ensemble_size) + single_rgb_dataset = TensorDataset(duplicated_rgb) + if batch_size > 0: + _bs = batch_size + else: + _bs = find_batch_size( + ensemble_size=ensemble_size, + input_res=max(rgb_norm.shape[1:]), + dtype=self.dtype, + ) + + single_rgb_loader = DataLoader( + single_rgb_dataset, batch_size=_bs, shuffle=False) + + # Predict depth maps (batched) + depth_pred_ls = [] + if show_progress_bar: + iterable = tqdm( + single_rgb_loader, + desc=' ' * 2 + 'Inference batches', + leave=False) + else: + iterable = single_rgb_loader + for batch in iterable: + (batched_img, ) = batch + depth_pred_raw = self.single_infer( + rgb_in=batched_img, + num_inference_steps=denoising_steps, + show_pbar=show_progress_bar, + ) + depth_pred_ls.append(depth_pred_raw.detach().clone()) + depth_preds = torch.concat(depth_pred_ls, axis=0).squeeze() + torch.cuda.empty_cache() # clear vram cache for ensembling + + # ----------------- Test-time ensembling ----------------- + if ensemble_size > 1: + depth_pred, pred_uncert = ensemble_depths( + depth_preds, **(ensemble_kwargs or {})) + else: + depth_pred = depth_preds + pred_uncert = None + + # ----------------- Post processing ----------------- + # Scale prediction to [0, 1] + min_d = torch.min(depth_pred) + max_d = torch.max(depth_pred) + depth_pred = (depth_pred - min_d) / (max_d - min_d) + + # Convert to numpy + depth_pred = depth_pred.cpu().numpy().astype(np.float32) + + # Resize back to original resolution + if match_input_res: + pred_img = Image.fromarray(depth_pred) + pred_img = pred_img.resize(input_size) + depth_pred = np.asarray(pred_img) + + # Clip output range + depth_pred = depth_pred.clip(0, 1) + + # Colorize + depth_colored = colorize_depth_maps( + depth_pred, 0, 1, + cmap=color_map).squeeze() # [3, H, W], value in (0, 1) + depth_colored = (depth_colored * 255).astype(np.uint8) + depth_colored_hwc = chw2hwc(depth_colored) + depth_colored_img = Image.fromarray(depth_colored_hwc) + return MarigoldDepthOutput( + depth_np=depth_pred, + depth_colored=depth_colored_img, + uncertainty=pred_uncert, + ) + + def __encode_empty_text(self): + """ + Encode text embedding for empty prompt + """ + prompt = '' + text_inputs = self.tokenizer( + prompt, + padding='do_not_pad', + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors='pt', + ) + text_input_ids = text_inputs.input_ids.to(self.text_encoder.device) + self.empty_text_embed = self.text_encoder(text_input_ids)[0].to( + self.dtype) + + @torch.no_grad() + 
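+ # (editorial note, not part of the original patch) single_infer below runs a
+ # standard DDIM loop: the RGB image is encoded to a latent, the depth latent
+ # starts from Gaussian noise, and it is iteratively denoised conditioned on
+ # the image latent plus an empty-text CLIP embedding, then decoded and
+ # rescaled from [-1, 1] to [0, 1].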
def single_infer(self, rgb_in: torch.Tensor, num_inference_steps: int, + show_pbar: bool) -> torch.Tensor: + r""" + Perform an individual depth prediction without ensembling. + + Args: + rgb_in (`torch.Tensor`): + Input RGB image. + num_inference_steps (`int`): + Number of diffusion denoisign steps (DDIM) during inference. + show_pbar (`bool`): + Display a progress bar of diffusion denoising. + Returns: + `torch.Tensor`: Predicted depth map. + """ + device = rgb_in.device + + # Set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps # [T] + + # Encode image + rgb_latent = self.encode_rgb(rgb_in) + + # Initial depth map (noise) + depth_latent = torch.randn( + rgb_latent.shape, device=device, dtype=self.dtype) # [B, 4, h, w] + + # Batched empty text embedding + if self.empty_text_embed is None: + self.__encode_empty_text() + batch_empty_text_embed = self.empty_text_embed.repeat( + (rgb_latent.shape[0], 1, 1)) # [B, 2, 1024] + + # Denoising loop + if show_pbar: + iterable = tqdm( + enumerate(timesteps), + total=len(timesteps), + leave=False, + desc=' ' * 4 + 'Diffusion denoising', + ) + else: + iterable = enumerate(timesteps) + + for i, t in iterable: + unet_input = torch.cat([rgb_latent, depth_latent], + dim=1) # this order is important + + # predict the noise residual + noise_pred = self.unet( + unet_input, t, encoder_hidden_states=batch_empty_text_embed + ).sample # [B, 4, h, w] + + # compute the previous noisy sample x_t -> x_t-1 + depth_latent = self.scheduler.step(noise_pred, t, + depth_latent).prev_sample + torch.cuda.empty_cache() + depth = self.decode_depth(depth_latent) + + # clip prediction + depth = torch.clip(depth, -1.0, 1.0) + # shift to [0, 1] + depth = (depth + 1.0) / 2.0 + + return depth + + def encode_rgb(self, rgb_in: torch.Tensor) -> torch.Tensor: + """ + Encode RGB image into latent. + + Args: + rgb_in (`torch.Tensor`): + Input RGB image to be encoded. + + Returns: + `torch.Tensor`: Image latent. + """ + # encode + h = self.vae.encoder(rgb_in) + moments = self.vae.quant_conv(h) + mean, logvar = torch.chunk(moments, 2, dim=1) + # scale latent + rgb_latent = mean * self.rgb_latent_scale_factor + return rgb_latent + + def decode_depth(self, depth_latent: torch.Tensor) -> torch.Tensor: + """ + Decode depth latent into depth map. + + Args: + depth_latent (`torch.Tensor`): + Depth latent to be decoded. + + Returns: + `torch.Tensor`: Decoded depth map. 
+ """ + # scale latent + depth_latent = depth_latent / self.depth_latent_scale_factor + # decode + z = self.vae.post_quant_conv(depth_latent) + stacked = self.vae.decoder(z) + # mean of output channels + depth_mean = stacked.mean(dim=1, keepdim=True) + return depth_mean + + def forward(self, x): + out = self.__call__(x) + return out diff --git a/modelscope/pipelines/cv/image_editing_pipeline.py b/modelscope/pipelines/cv/image_editing_pipeline.py index 15e21eafb..489fa422a 100644 --- a/modelscope/pipelines/cv/image_editing_pipeline.py +++ b/modelscope/pipelines/cv/image_editing_pipeline.py @@ -12,7 +12,7 @@ from modelscope.metainfo import Pipelines from modelscope.models.cv.image_editing import ( - MutualSelfAttentionControl, regiter_attention_editor_diffusers) + MutualSelfAttentionControl, register_attention_editor_diffusers) from modelscope.outputs import OutputKeys from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.multi_modal.diffusers_wrapped.diffusers_pipeline import \ @@ -97,7 +97,7 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: start_code = start_code.expand(len(prompts), -1, -1, -1) STEP, LAYER = 4, 10 editor = MutualSelfAttentionControl(STEP, LAYER) - regiter_attention_editor_diffusers(self.pipeline, editor) + register_attention_editor_diffusers(self.pipeline, editor) # inference the synthesized image output = self.pipeline( diff --git a/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py new file mode 100644 index 000000000..a49ca08d6 --- /dev/null +++ b/modelscope/pipelines/cv/image_local_feature_matching_pipeline.py @@ -0,0 +1,122 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_local_feature_matching, + module_name=Pipelines.image_local_feature_matching) +class ImageLocalFeatureMatchingPipeline(Pipeline): + r""" Image Local Feature Matching Pipeline. 
+ + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> matcher = pipeline(Tasks.image_local_feature_matching, + >>> model='Damo_XR_Lab/cv_resnet-transformer_local-feature-matching_outdoor-data') + >>> matcher([['https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching1.jpg', + >>> 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_matching2.jpg']]) + >>> [{ + >>> 'matches': [array([[720.5 , 187.8 ], + >>> [707.4 , 198.23334], + >>> ..., + >>> [746.7 , 594.7 ], + >>> [759.8 , 594.7 ]], dtype=float32), + >>> array([[652.49744 , 29.599142], + >>> [639.25287 , 45.90798 ], + >>> [653.041 , 43.399014], + >>> ..., + >>> [670.8787 , 547.8298 ], + >>> [608.5573 , 548.97815 ], + >>> [617.82574 , 548.601 ]], dtype=float32), + >>> array([0.25541496, 0.2781789 , 0.20776041, ..., 0.39656195, 0.7202848 , + >>> 0.37208357], dtype=float32)], + >>> 'output_img': array([[[255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> ..., + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255]], + >>> ..., + >>> [[255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> ..., + >>> [255, 255, 255], + >>> [255, 255, 255], + >>> [255, 255, 255]]], dtype=uint8)}] + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image local feature matching pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + def load_image(self, img_name): + img = LoadImage.convert_to_ndarray(img_name).astype(np.float32) + img = img / 255. + # convert rgb to gray + if len(img.shape) == 3: + img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + H, W = 480, 640 + h_scale, w_scale = H / img.shape[0], W / img.shape[1] + img = cv2.resize(img, (W, H)) + return img, h_scale, w_scale + + def preprocess(self, input: Input): + assert len(input) == 2, 'input should be a list of two images' + + img1, h_scale1, w_scale1 = self.load_image(input[0]) + + img2, h_scale2, w_scale2 = self.load_image(input[1]) + + img1 = torch.from_numpy(img1)[None][None].cuda().float() + img2 = torch.from_numpy(img2)[None][None].cuda().float() + return { + 'image0': img1, + 'image1': img2, + 'scale_info': [h_scale1, w_scale1, h_scale2, w_scale2] + } + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + matches = results[OutputKeys.MATCHES] + + kpts0 = matches['kpts0'].cpu().numpy() + kpts1 = matches['kpts1'].cpu().numpy() + conf = matches['conf'].cpu().numpy() + scale_info = [v.cpu().numpy() for v in inputs['scale_info']] + kpts0[:, 0] = kpts0[:, 0] / scale_info[1] + kpts0[:, 1] = kpts0[:, 1] / scale_info[0] + kpts1[:, 0] = kpts1[:, 0] / scale_info[3] + kpts1[:, 1] = kpts1[:, 1] / scale_info[2] + + outputs = { + OutputKeys.MATCHES: [kpts0, kpts1, conf], + OutputKeys.OUTPUT_IMG: results[OutputKeys.OUTPUT_IMG] + } + + return outputs diff --git a/modelscope/pipelines/cv/image_matching_fast_pipeline.py b/modelscope/pipelines/cv/image_matching_fast_pipeline.py new file mode 100644 index 000000000..8af15f721 --- /dev/null +++ b/modelscope/pipelines/cv/image_matching_fast_pipeline.py @@ -0,0 +1,105 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
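The postprocess step above divides the predicted keypoints by the per-image resize factors to return them in original-image coordinates. A quick worked example, with the original image size invented for illustration:

```python
# Worked example of the keypoint rescaling in
# ImageLocalFeatureMatchingPipeline.postprocess.
import numpy as np

H, W = 480, 640                      # fixed inference resolution used above
orig_h, orig_w = 960, 1280           # hypothetical original image size
h_scale, w_scale = H / orig_h, W / orig_w         # 0.5, 0.5

kpts = np.array([[320.0, 240.0], [64.0, 400.0]])  # (x, y) on the resized image
kpts[:, 0] /= w_scale                # x back to original width
kpts[:, 1] /= h_scale                # y back to original height
print(kpts)                          # [[640. 480.] [128. 800.]]
```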
+from typing import Any, Dict, List, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_matching, module_name=Pipelines.image_matching_fast) +class ImageMatchingFastPipeline(Pipeline): + """ Image Matching Pipeline. + + Examples: + + >>> from modelscope.outputs import OutputKeys + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + + >>> task = 'image-matching' + >>> model_id = 'Damo_XR_Lab/cv_transformer_image-matching_fast' + + >>> input_location = [[ + >>> 'data/test/images/image_matching1.jpg', + >>> 'data/test/images/image_matching1.jpg', + >>> ]] + >>> estimator = pipeline(task, model=model_id) + >>> result = estimator(input_location) + >>> kpts0, kpts1, confidence = result[0][OutputKeys.MATCHES] + >>> print(f'Found {len(kpts0)} matches') + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image matching pipeline fast for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + # check if cuda is available + if not torch.cuda.is_available(): + raise RuntimeError( + 'Cuda is not available. Image matching model only supports cuda.' + ) + + logger.info('image matching model, pipeline init') + + def load_image(self, img_name): + image_loader = LoadImage(backend='cv2') + img = image_loader(img_name)['img'] + return img + + def preprocess(self, input: Input): + assert len(input) == 2, 'input should be a list of two images' + img1 = self.load_image(input[0]) + img2 = self.load_image(input[1]) + + return {'image0': img1, 'image1': img2} + + def forward(self, input: Dict[str, Any]) -> list: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: list) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + matches = results[OutputKeys.MATCHES] + + kpts0 = matches['kpts0'].detach().cpu().numpy() + kpts1 = matches['kpts1'].detach().cpu().numpy() + confidence = matches['confidence'].detach().cpu().numpy() + + outputs = { + OutputKeys.MATCHES: [kpts0, kpts1, confidence], + } + + return outputs + + def __call__(self, input, **kwargs): + """ + Match two images and return the matched keypoints and confidence. + + Args: + input (`List[List[str]]`): A list of two image paths. + + Return: + A list of result. + The list contain the following values: + + - kpts0 -- Matched keypoints in the first image + - kpts1 -- Matched keypoints in the second image + - confidence -- Confidence of the match + """ + return super().__call__(input, **kwargs) diff --git a/modelscope/pipelines/cv/image_normal_estimation_pipeline.py b/modelscope/pipelines/cv/image_normal_estimation_pipeline.py new file mode 100644 index 000000000..6622a6ee3 --- /dev/null +++ b/modelscope/pipelines/cv/image_normal_estimation_pipeline.py @@ -0,0 +1,154 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
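A possible downstream use of the `kpts0`/`kpts1`/`confidence` arrays returned by the fast matching pipeline above: filtering and drawing matches with OpenCV. The helper is written for illustration only and is not part of the patch.

```python
# Hypothetical helper (not in the patch): visualize matches returned by
# ImageMatchingFastPipeline. Assumes both images share the same height.
import cv2
import numpy as np


def draw_matches(img0, img1, kpts0, kpts1, confidence, thr=0.5):
    keep = confidence > thr                       # drop low-confidence matches
    canvas = np.concatenate([img0, img1], axis=1)  # side-by-side canvas
    offset = img0.shape[1]                         # x shift for the right image
    for (x0, y0), (x1, y1) in zip(kpts0[keep], kpts1[keep]):
        cv2.line(canvas, (int(x0), int(y0)), (int(x1) + offset, int(y1)),
                 (0, 255, 0), 1)
    return canvas
```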
+from typing import Any, Dict, Union + +import cv2 +import numpy as np +import PIL +import torch + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Model, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.image_normal_estimation, + module_name=Pipelines.image_normal_estimation) +class ImageNormalEstimationPipeline(Pipeline): + r""" Image Normal Estimation Pipeline. + + Examples: + + >>> from modelscope.pipelines import pipeline + + >>> estimator = pipeline( + >>> Tasks.image_normal_estimation, model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal') + >>> estimator("https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg") + >>> { + >>> "normals": array([[[0.09233217, 0.07563387, 0.08025375, ..., 0.06992684, + >>> 0.07490329, 0.14308228], + >>> [0.07833742, 0.06736029, 0.07296766, ..., 0.09184352, + >>> 0.0800755 , 0.09726034], + >>> [0.07676302, 0.06631223, 0.07067154, ..., 0.09527256, + >>> 0.09292313, 0.08056315], + >>> ..., + >>> [0.26432115, 0.29100573, 0.2956126 , ..., 0.2913087 , + >>> 0.29201347, 0.29539976], + >>> [0.24557455, 0.26430887, 0.28548756, ..., 0.2877307 , + >>> 0.28856137, 0.2937242 ], + >>> [0.26316068, 0.2718169 , 0.28436714, ..., 0.29435217, + >>> 0.29842147, 0.2943223 ]], + >>> [[0.59257126, 0.6459297 , 0.66572756, ..., 0.68350476, + >>> 0.6882835 , 0.66579086], + >>> [0.7054596 , 0.6592535 , 0.6728153 , ..., 0.6589912 , + >>> 0.64541686, 0.63954735], + >>> [0.6912665 , 0.6638877 , 0.67816293, ..., 0.6607329 , + >>> 0.6472897 , 0.64633334], + >>> ..., + >>> [0.04231769, 0.04427819, 0.04816979, ..., 0.04485315, + >>> 0.04652229, 0.04869233], + >>> [0.04601872, 0.03706329, 0.04397734, ..., 0.04522909, + >>> 0.04745695, 0.04823782], + >>> [0.06671816, 0.0520605 , 0.0563788 , ..., 0.04913886, + >>> 0.04974678, 0.04954173]], + >>> [[0.4338835 , 0.43240184, 0.43519282, ..., 0.36894026, + >>> 0.35207224, 0.33153164], + >>> [0.4786287 , 0.4399531 , 0.4350407 , ..., 0.34690523, + >>> 0.3179497 , 0.26544768], + >>> [0.47692937, 0.4416514 , 0.437603 , ..., 0.34660107, + >>> 0.3102659 , 0.27787644], + >>> ..., + >>> [0.49566334, 0.48355937, 0.48710674, ..., 0.4964854 , + >>> 0.48945957, 0.49413157], + >>> [0.490632 , 0.4706958 , 0.48100013, ..., 0.48724395, + >>> 0.4799561 , 0.48129278], + >>> [0.49428058, 0.47433382, 0.4823783 , ..., 0.48930234, + >>> 0.48616886, 0.47176325]]], dtype=float32), + >>> 'normals_color': array([[[ 23, 151, 110], + >>> [ 19, 164, 110], + >>> [ 20, 169, 110], + >>> ..., + >>> [ 17, 174, 94], + >>> [ 19, 175, 89], + >>> [ 36, 169, 84]], + >>> [[ 19, 179, 122], + >>> [ 17, 168, 112], + >>> [ 18, 171, 110], + >>> ..., + >>> [ 23, 168, 88], + >>> [ 20, 164, 81], + >>> [ 24, 163, 67]], + >>> [[ 19, 176, 121], + >>> [ 16, 169, 112], + >>> [ 18, 172, 111], + >>> ..., + >>> [ 24, 168, 88], + >>> [ 23, 165, 79], + >>> [ 20, 164, 70]], + >>> ..., + >>> [[ 67, 10, 126], + >>> [ 74, 11, 123], + >>> [ 75, 12, 124], + >>> ..., + >>> [ 74, 11, 126], + >>> [ 74, 11, 124], + >>> [ 75, 12, 126]], + >>> [[ 62, 11, 125], + >>> [ 67, 9, 120], + >>> [ 72, 11, 122], + >>> ..., + >>> [ 73, 11, 124], + >>> [ 73, 12, 122], + >>> [ 74, 12, 122]], + >>> [[ 67, 17, 126], + >>> [ 69, 13, 120], + >>> [ 72, 14, 123], + >>> ..., + >>> [ 75, 12, 
124], + >>> [ 76, 12, 123], + >>> [ 75, 12, 120]]], dtype=uint8)} + """ + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image normal estimation pipeline for prediction + Args: + model: model id on modelscope hub. + """ + super().__init__(model=model, **kwargs) + + logger.info('normal estimation model, pipeline init') + + def preprocess(self, input: Input) -> Dict[str, Any]: + img = LoadImage.convert_to_ndarray(input).astype(np.float32) + H, W = 384, 384 + img = cv2.resize(img, [W, H]) + img = img.transpose(2, 0, 1) / 255.0 + imgs = img[None, ...] + data = {'imgs': imgs} + + return data + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.inference(input) + return results + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + results = self.model.postprocess(inputs) + normals = results[OutputKeys.NORMALS] + if isinstance(normals, torch.Tensor): + normals = normals.detach().cpu().squeeze().numpy() + normals_color = (np.transpose(normals, + (1, 2, 0)) * 255).astype(np.uint8) + outputs = { + OutputKeys.NORMALS: normals, + OutputKeys.NORMALS_COLOR: normals_color + } + + return outputs diff --git a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py index 18883171d..72d65cae4 100644 --- a/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py +++ b/modelscope/pipelines/cv/image_portrait_enhancement_pipeline.py @@ -173,11 +173,13 @@ def sr_process(self, img): def preprocess(self, input: Input) -> Dict[str, Any]: img = LoadImage.convert_to_ndarray(input) - img_sr = img if self.use_sr: img_sr = self.sr_process(img) - img = cv2.resize(img, img_sr.shape[:2][::-1]) + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + else: + img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) + img_sr = img.copy() result = {'img': img, 'img_sr': img_sr} return result @@ -200,6 +202,9 @@ def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: of, of_112, tfm_inv = warp_and_crop_face( img, facial5points, crop_size=(self.size, self.size)) + of = of[..., ::-1].copy() # BGR->RGB + of_112 = of_112[..., ::-1].copy() # BGR->RGB + # detect orig face quality fq_o, fea_o = self.eqface.get_face_quality(of_112) if fq_o < self.fqa_thres: diff --git a/modelscope/pipelines/cv/image_to_3d_pipeline.py b/modelscope/pipelines/cv/image_to_3d_pipeline.py new file mode 100644 index 000000000..d74003d6f --- /dev/null +++ b/modelscope/pipelines/cv/image_to_3d_pipeline.py @@ -0,0 +1,140 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. 
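# Editor's note: a short usage sketch for the ImageNormalEstimationPipeline above.
# OutputKeys.NORMALS holds the raw CHW float map and OutputKeys.NORMALS_COLOR the
# uint8 HWC visualization built in postprocess; the output filename is illustrative only.
import cv2
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

estimator = pipeline(
    Tasks.image_normal_estimation,
    model='Damo_XR_Lab/cv_omnidata_image-normal-estimation_normal')
result = estimator(
    'https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/image_normal_estimation.jpg')
cv2.imwrite('normals_vis.png', result[OutputKeys.NORMALS_COLOR])  # H x W x 3, uint8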
+import os.path as osp +from typing import Any, Dict + +import cv2 +import numpy as np +import PIL +import rembg +import torch +import torch.nn.functional as F +import torchvision.transforms as T +import torchvision.transforms.functional as TF +from omegaconf import OmegaConf +from PIL import Image +from torchvision.utils import save_image + +# import modelscope.models.cv.image_to_image_generation.data as data +# import modelscope.models.cv.image_to_image_generation.models as models +# import modelscope.models.cv.image_to_image_generation.ops as ops +from modelscope.metainfo import Pipelines +# from modelscope.models.cv.image_to_3d.ldm.models.diffusion.sync_dreamer import \ +# SyncMultiviewDiffusion +from modelscope.models.cv.image_to_3d.ldm.util import (add_margin, + instantiate_from_config) +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +# from modelscope.models.cv.image_to_3d.model import UNet +# from modelscope.models.cv.image_to_image_generation.models.clip import \ +# VisionTransformer + +logger = get_logger() + + +# Load Syncdreamer Model +def load_model(cfg, ckpt, strict=True): + config = OmegaConf.load(cfg) + model = instantiate_from_config(config.model) + print(f'loading model from {ckpt} ...') + ckpt = torch.load(ckpt, map_location='cpu') + model.load_state_dict(ckpt['state_dict'], strict=strict) + model = model.cuda().eval() + return model + + +# Prepare Syncdreamer Input +def prepare_inputs(image_input, elevation_input, crop_size=-1, image_size=256): + image_input[:, :, :3] = image_input[:, :, :3][:, :, ::-1] + image_input = Image.fromarray(image_input) + if crop_size != -1: + alpha_np = np.asarray(image_input)[:, :, 3] + coords = np.stack(np.nonzero(alpha_np), 1)[:, (1, 0)] + min_x, min_y = np.min(coords, 0) + max_x, max_y = np.max(coords, 0) + ref_img_ = image_input.crop((min_x, min_y, max_x, max_y)) + h, w = ref_img_.height, ref_img_.width + scale = crop_size / max(h, w) + h_, w_ = int(scale * h), int(scale * w) + ref_img_ = ref_img_.resize((w_, h_), resample=Image.BICUBIC) + image_input = add_margin(ref_img_, size=image_size) + else: + image_input = add_margin( + image_input, size=max(image_input.height, image_input.width)) + image_input = image_input.resize((image_size, image_size), + resample=Image.BICUBIC) + + image_input = np.asarray(image_input) + image_input = image_input.astype(np.float32) / 255.0 + ref_mask = image_input[:, :, 3:] + image_input[:, :, : + 3] = image_input[:, :, : + 3] * ref_mask + 1 - ref_mask # white background + image_input = image_input[:, :, :3] * 2.0 - 1.0 + image_input = torch.from_numpy(image_input.astype(np.float32)) + elevation_input = torch.from_numpy( + np.asarray([np.deg2rad(elevation_input)], np.float32)) + return {'input_image': image_input, 'input_elevation': elevation_input} + + +@PIPELINES.register_module( + Tasks.image_to_3d, module_name=Pipelines.image_to_3d) +class Image23DPipeline(Pipeline): + + def __init__(self, model: str, **kwargs): + """ + use `model` to create a image-to-3d generation pipeline + Args: + model: model id on modelscope hub. 
+ """ + super().__init__(model=model) + config_path = osp.join(self.model, ModelFile.CONFIGURATION) + logger.info(f'loading config from {config_path}') + self.cfg = Config.from_file(config_path) + # print(config_path) + if torch.cuda.is_available(): + self._device = torch.device('cuda') + else: + self._device = torch.device('cpu') + ckpt = config_path.replace('configuration.json', + 'syncdreamer-pretrain.ckpt') + self.model = load_model( + config_path.replace('configuration.json', 'syncdreamer.yaml'), + ckpt).to(self._device) + # os.system("pip install -r {}".format(config_path.replace("configuration.json", "requirements.txt"))) + # assert isinstance(self.model, SyncMultiviewDiffusion) + + def preprocess(self, input: Input) -> Dict[str, Any]: + + result = rembg.remove(Image.open(input)) + print(type(result)) + img = np.array(result) + img[:, :, :3] = img[:, :, :3][:, :, ::-1] + # img = cv2.imread(input) + data = prepare_inputs( + img, elevation_input=10, crop_size=200, image_size=256) + + for k, v in data.items(): + data[k] = v.unsqueeze(0).cuda() + data[k] = torch.repeat_interleave( + data[k], 1, dim=0) # only one sample + return data + + @torch.no_grad() + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + x_sample = self.model.sample(input, 2.0, 8) + + B, N, _, H, W = x_sample.shape + x_sample = (torch.clamp(x_sample, max=1.0, min=-1.0) + 1) * 0.5 + x_sample = x_sample.permute(0, 1, 3, 4, 2).cpu().numpy() * 255 + x_sample = x_sample.astype(np.uint8) + show_in_im2 = [Image.fromarray(x_sample[0, ni]) for ni in range(N)] + return {'MViews': show_in_im2} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + return inputs diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py index cb7522c0d..5b0fbda5a 100644 --- a/modelscope/pipelines/cv/ocr_detection_pipeline.py +++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py @@ -6,7 +6,6 @@ import cv2 import numpy as np -import tensorflow as tf import torch from modelscope.metainfo import Pipelines @@ -19,18 +18,7 @@ from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.device import device_placement from modelscope.utils.logger import get_logger -from .ocr_utils import (SegLinkDetector, boxes_from_bitmap, cal_width, - combine_segments_python, decode_segments_links_python, - nms_python, polygons_from_bitmap, rboxes_to_polygons) - -if tf.__version__ >= '2.0': - import tf_slim as slim -else: - from tensorflow.contrib import slim - -if tf.__version__ >= '2.0': - tf = tf.compat.v1 -tf.compat.v1.disable_eager_execution() +from .ocr_utils import cal_width, nms_python, rboxes_to_polygons logger = get_logger() @@ -39,12 +27,8 @@ OFFSET_DIM = 6 WORD_POLYGON_DIM = 8 OFFSET_VARIANCE = [0.1, 0.1, 0.1, 0.1, 0.1, 0.1] - -FLAGS = tf.app.flags.FLAGS -tf.app.flags.DEFINE_float('node_threshold', 0.4, - 'Confidence threshold for nodes') -tf.app.flags.DEFINE_float('link_threshold', 0.6, - 'Confidence threshold for links') +TF_NODE_THRESHOLD = 0.4 +TF_LINK_THRESHOLD = 0.6 @PIPELINES.register_module( @@ -57,7 +41,7 @@ class OCRDetectionPipeline(Pipeline): ```python >>> from modelscope.pipelines import pipeline - >>> ocr_detection = pipeline('ocr_detection', model='damo/cv_resnet18_ocr-detection-line-level_damo') + >>> ocr_detection = pipeline('ocr-detection', model='damo/cv_resnet18_ocr-detection-line-level_damo') >>> result = ocr_detection('https://modelscope.oss-cn-beijing.aliyuncs.com/test/images/ocr_detection.jpg') {'polygons': array([[220, 14, 780, 14, 
780, 64, 220, 64], @@ -99,6 +83,16 @@ def __init__(self, model: str, **kwargs): logger.info('loading model done') else: # for model seglink++ + import tensorflow as tf + + if tf.__version__ >= '2.0': + tf = tf.compat.v1 + tf.compat.v1.disable_eager_execution() + + tf.app.flags.DEFINE_float('node_threshold', TF_NODE_THRESHOLD, + 'Confidence threshold for nodes') + tf.app.flags.DEFINE_float('link_threshold', TF_LINK_THRESHOLD, + 'Confidence threshold for links') tf.reset_default_graph() model_path = osp.join( osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER), @@ -125,6 +119,7 @@ def __init__(self, model: str, **kwargs): variable_averages = tf.train.ExponentialMovingAverage( 0.997, global_step) + from .ocr_utils import SegLinkDetector, combine_segments_python, decode_segments_links_python # detector detector = SegLinkDetector() all_maps = detector.build_model( @@ -198,6 +193,7 @@ def preprocess(self, input: Input) -> Dict[str, Any]: result = self.preprocessor(input) return result else: + # for model seglink++ img = LoadImage.convert_to_ndarray(input) h, w, c = img.shape diff --git a/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py b/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py new file mode 100644 index 000000000..a4892273e --- /dev/null +++ b/modelscope/pipelines/cv/rife_video_frame_interpolation_pipeline.py @@ -0,0 +1,127 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import glob +import math +import os +import os.path as osp +import subprocess +import tempfile +from typing import Any, Dict, Optional, Union + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F +from torchvision.utils import make_grid + +from modelscope.metainfo import Pipelines +from modelscope.models.cv.video_frame_interpolation.rife import RIFEModel +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input, Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.preprocessors import LoadImage +from modelscope.preprocessors.cv import VideoReader +from modelscope.utils.config import Config +from modelscope.utils.constant import ModelFile, Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.video_frame_interpolation, + module_name=Pipelines.rife_video_frame_interpolation) +class RIFEVideoFrameInterpolationPipeline(Pipeline): + r""" RIFE Video Frame Interpolation Pipeline. 
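+    Roughly doubles the frame rate of the input video (out_fps = 2 * fps) by
+    inserting a RIFE-interpolated frame between consecutive input frames.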
+ + Examples: + + >>> from modelscope.pipelines import pipeline + >>> from modelscope.utils.constant import Tasks + >>> from modelscope.outputs import OutputKeys + + >>> video = 'https://modelscope.oss-cn-beijing.aliyuncs.com/test/videos/video_frame_interpolation_test.mp4' + >>> video_frame_interpolation_pipeline = pipeline(Tasks.video_frame_interpolation, + 'Damo_XR_Lab/cv_rife_video-frame-interpolation') + >>> result = video_frame_interpolation_pipeline(video)[OutputKeys.OUTPUT_VIDEO] + >>> print('pipeline: the output video path is {}'.format(result)) + + """ + + def __init__(self, + model: Union[RIFEModel, str], + preprocessor=None, + **kwargs): + super().__init__(model=model, preprocessor=preprocessor, **kwargs) + if (isinstance(model, str)): + self.model = RIFEModel(model) + logger.info('load video frame-interpolation done') + + def preprocess(self, input: Input, out_fps: float = 0) -> Dict[str, Any]: + # Determine the input type + if isinstance(input, str): + video_reader = VideoReader(input) + elif isinstance(input, dict): + video_reader = VideoReader(input['video']) + inputs = [] + for frame in video_reader: + inputs.append(frame) + fps = video_reader.fps + + for i, img in enumerate(inputs): + img = torch.from_numpy(img.copy()).permute(2, 0, 1).float() + inputs[i] = img.unsqueeze(0).to(self.model.device) + + out_fps = 2 * fps + return {'video': inputs, 'fps': fps, 'out_fps': out_fps} + + def forward(self, input: Dict[str, Any]) -> Dict[str, Any]: + inputs = input['video'] + # fps = input['fps'] + out_fps = input['out_fps'] + video_len = len(inputs) + + h, w = inputs[0].shape[-2:] + ph = ((h - 1) // 32 + 1) * 32 + pw = ((w - 1) // 32 + 1) * 32 + padding = (0, pw - w, 0, ph - h) + + outputs = [] + for i in range(video_len): + if i == 0: + outputs.append(inputs[i]) + elif i == video_len - 1: + outputs.append(inputs[i]) + else: + i0 = F.pad(inputs[i - 1] / 255., padding).to(self.model.device) + i1 = F.pad(inputs[i] / 255., padding).to(self.model.device) + output = self.model.inference(i0, i1)[:, :, :h, :w] + output = output.cpu() * 255 + torch.cuda.empty_cache() + outputs.append(output) + outputs.append(inputs[i]) + return {'output': outputs, 'fps': out_fps} + + def postprocess(self, inputs: Dict[str, Any], **kwargs) -> Dict[str, Any]: + output_video_path = kwargs.get('output_video', None) + demo_service = kwargs.get('demo_service', True) + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name + h, w = inputs['output'][0].shape[-2:] + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(output_video_path, fourcc, + inputs['fps'], (w, h)) + for i in range(len(inputs['output'])): + img = inputs['output'][i] + img = img[0].permute(1, 2, 0).byte().cpu().numpy() + video_writer.write(img.astype(np.uint8)) + + video_writer.release() + if demo_service: + assert os.system( + 'ffmpeg -version') == 0, 'ffmpeg is not installed correctly!' 
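+            # Re-encode to h264 with ffmpeg so the demo result plays in browsers;
+            # the mp4v stream written by cv2.VideoWriter above is not web-friendly.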
+ output_video_path_for_web = output_video_path[:-4] + '_web.mp4' + convert_cmd = f'ffmpeg -i {output_video_path} -vcodec h264 -crf 5 {output_video_path_for_web}' + subprocess.call(convert_cmd, shell=True) + return {OutputKeys.OUTPUT_VIDEO: output_video_path_for_web} + else: + return {OutputKeys.OUTPUT_VIDEO: output_video_path} diff --git a/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py b/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py new file mode 100644 index 000000000..3f16d8ff0 --- /dev/null +++ b/modelscope/pipelines/cv/self_supervised_depth_completion_pipeline.py @@ -0,0 +1,59 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +from typing import Any, Dict + +from modelscope.metainfo import Pipelines +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.utils.constant import Tasks +from modelscope.utils.logger import get_logger + +logger = get_logger() + + +@PIPELINES.register_module( + Tasks.self_supervised_depth_completion, + module_name=Pipelines.self_supervised_depth_completion) +class SelfSupervisedDepthCompletionPipeline(Pipeline): + """Self Supervise dDepth Completion Pipeline + Example: + + ```python + >>> from modelscope.pipelines import pipeline + >>> model_id = 'Damo_XR_Lab/Self_Supervised_Depth_Completion' + >>> data_dir = MsDataset.load( + 'KITTI_Depth_Dataset', + namespace='Damo_XR_Lab', + split='test', + download_mode=DownloadMode.FORCE_REDOWNLOAD + ).config_kwargs['split_config']['test'] + >>> source_dir = os.path.join(data_dir, 'selected_data') + >>> self_supervised_depth_completion = pipeline(Tasks.self_supervised_depth_completion, + 'Damo_XR_Lab/Self_Supervised_Depth_Completion') + >>> result = self_supervised_depth_completion({ + 'model_dir': model_id + 'source_dir': source_dir + }) + cv2.imwrite('result.jpg', result[OutputKeys.OUTPUT]) + >>> # + ``` + """ + + def __init__(self, model: str, **kwargs): + + super().__init__(model=model, **kwargs) + logger.info('load model done') + + def preprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """preprocess, not used at present""" + return inputs + + def forward(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """forward""" + source_dir = inputs['source_dir'] + result = self.model.forward(source_dir) + return {OutputKeys.OUTPUT: result} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + """postprocess, not used at present""" + return inputs diff --git a/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py b/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py index 1b791634e..320d83e7e 100644 --- a/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py +++ b/modelscope/pipelines/multi_modal/efficient_diffusion_tuning_pipeline.py @@ -36,8 +36,10 @@ def __init__(self, model: str, **kwargs): 'data/test/images/vision_efficient_tuning_test_1.png') >>> print(f'Output: {result}.') """ + logger.warn( + '[NOTE]Do not use this pipeline because the dependencies are too old, ' + 'use https://github.com/modelscope/DiffSynth-Studio instead') super().__init__(model=model, **kwargs) - self.device = 'cuda' if torch.cuda.is_available() else 'cpu' self.model = self.model.to(self.device) self.model.eval() diff --git a/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py new file mode 100644 index 000000000..f19eddffe --- /dev/null +++ 
b/modelscope/pipelines/multi_modal/ovis_vl_pipeline.py @@ -0,0 +1,103 @@ +from typing import Any, Dict, Union + +import torch + +from modelscope import AutoModelForCausalLM +from modelscope.metainfo import Pipelines, Preprocessors +from modelscope.models.base import Model +from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Pipeline +from modelscope.pipelines.builder import PIPELINES +from modelscope.pipelines.multi_modal.visual_question_answering_pipeline import \ + VisualQuestionAnsweringPipeline +from modelscope.preprocessors import Preprocessor, load_image +from modelscope.utils.constant import Fields, Frameworks, Tasks + + +@PIPELINES.register_module( + Tasks.visual_question_answering, module_name='ovis-vl') +class VisionChatPipeline(VisualQuestionAnsweringPipeline): + + def __init__(self, + model: Union[Model, str], + preprocessor: Preprocessor = None, + config_file: str = None, + device: str = 'gpu', + auto_collate=True, + **kwargs): + # super().__init__ + self.device_name = device + self.framework = Frameworks.torch + self._model_prepare = True + self._auto_collate = auto_collate + + # ovis + torch_dtype = kwargs.get('torch_dtype', torch.float16) + multimodal_max_length = kwargs.get('multimodal_max_length', 8192) + self.device = 'cuda' if device == 'gpu' else device + self.model = AutoModelForCausalLM.from_pretrained( + model, + torch_dtype=torch_dtype, + multimodal_max_length=multimodal_max_length, + trust_remote_code=True).to(self.device) + self.text_tokenizer = self.model.get_text_tokenizer() + self.visual_tokenizer = self.model.get_visual_tokenizer() + + def preprocess(self, inputs: Dict[str, Any]): + text = inputs['text'] + image_path_or_url = inputs['image'] + image = load_image(image_path_or_url) + query = f'\n{text}' + _, input_ids, pixel_values = self.model.preprocess_inputs( + query, [image]) + attention_mask = torch.ne(input_ids, self.text_tokenizer.pad_token_id) + input_ids = input_ids.unsqueeze(0).to(device=self.model.device) + attention_mask = attention_mask.unsqueeze(0).to( + device=self.model.device) + pixel_values = [ + pixel_values.to( + dtype=self.visual_tokenizer.dtype, + device=self.visual_tokenizer.device) + ] + + return { + 'input_ids': input_ids, + 'pixel_values': pixel_values, + 'attention_mask': attention_mask + } + + def forward(self, inputs: Dict[str, Any], + **forward_params) -> Dict[str, Any]: + input_ids = inputs['input_ids'] + pixel_values = inputs['pixel_values'] + attention_mask = inputs['attention_mask'] + + max_new_tokens = forward_params.get('max_new_tokens', 1024) + do_sample = forward_params.get('do_sample', False) + top_p = forward_params.get('top_p', None) + top_k = forward_params.get('top_k', None) + temperature = forward_params.get('temperature', None) + repetition_penalty = forward_params.get('repetition_penalty', None) + with torch.inference_mode(): + gen_kwargs = dict( + max_new_tokens=max_new_tokens, + do_sample=do_sample, + top_p=top_p, + top_k=top_k, + temperature=temperature, + repetition_penalty=repetition_penalty, + eos_token_id=self.model.generation_config.eos_token_id, + pad_token_id=self.text_tokenizer.pad_token_id, + use_cache=True) + output_ids = self.model.generate( + input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + **gen_kwargs)[0] + return {'output_ids': output_ids} + + def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]: + output_ids = inputs['output_ids'] + output = self.text_tokenizer.decode( + output_ids, skip_special_tokens=True) + return 
{OutputKeys.TEXT: output} diff --git a/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py b/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py index f3ff7ccea..59320577d 100644 --- a/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py +++ b/modelscope/pipelines/multi_modal/text_to_video_synthesis_pipeline.py @@ -6,6 +6,7 @@ import cv2 import torch +import torchvision from einops import rearrange from modelscope.metainfo import Pipelines @@ -75,14 +76,17 @@ def postprocess(self, inputs: Dict[str, Any], output_video_path = tempfile.NamedTemporaryFile(suffix='.mp4').name temp_video_file = True - fourcc = cv2.VideoWriter_fourcc(*'mp4v') - h, w, c = video[0].shape - video_writer = cv2.VideoWriter( - output_video_path, fourcc, fps=8, frameSize=(w, h)) - for i in range(len(video)): - img = cv2.cvtColor(video[i], cv2.COLOR_RGB2BGR) - video_writer.write(img) - video_writer.release() + # Ensure video is a list of frames with shape (h, w, c) + frames = [torch.from_numpy(frame) for frame in video] + # Stack frames along a new dimension to create a 4D tensor (T, H, W, C) + imgs_tensor = torch.stack(frames, dim=0) + + torchvision.io.write_video( + output_video_path, + imgs_tensor, + fps=8, + video_codec='h264', + options={'crf': '10'}) if temp_video_file: video_file_content = b'' with open(output_video_path, 'rb') as f: diff --git a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py index 0b2ba1996..3205f8b5f 100644 --- a/modelscope/pipelines/nlp/faq_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/faq_question_answering_pipeline.py @@ -50,6 +50,9 @@ def _sanitize_parameters(self, **pipeline_parameters): return pipeline_parameters, pipeline_parameters, pipeline_parameters def get_sentence_embedding(self, inputs, max_len=None): + if (self.model or (self.has_multiple_models and self.models[0])): + if not self._model_prepare: + self.prepare_model() inputs = self.preprocessor.batch_encode(inputs, max_length=max_len) sentence_vecs = self.model.forward_sentence_embedding(inputs) sentence_vecs = sentence_vecs.detach().tolist() diff --git a/modelscope/pipelines/nlp/llm_pipeline.py b/modelscope/pipelines/nlp/llm_pipeline.py index 5cd2dcb16..c46bb46ae 100644 --- a/modelscope/pipelines/nlp/llm_pipeline.py +++ b/modelscope/pipelines/nlp/llm_pipeline.py @@ -1,12 +1,13 @@ # Copyright (c) Alibaba, Inc. and its affiliates. 
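# Editor's note on the text_to_video_synthesis change above: torchvision.io.write_video
# expects a uint8 tensor of shape (T, H, W, C) in RGB order, which is why the per-frame
# cv2.cvtColor/VideoWriter loop could be dropped. A minimal sketch with dummy frames:
import torch
import torchvision

frames = [torch.randint(0, 256, (256, 256, 3), dtype=torch.uint8) for _ in range(16)]
imgs_tensor = torch.stack(frames, dim=0)  # (T, H, W, C), uint8
torchvision.io.write_video('sample.mp4', imgs_tensor, fps=8,
                           video_codec='h264', options={'crf': '10'})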
import os -import os.path as osp from contextlib import contextmanager -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union +from threading import Lock +from typing import Any, Callable, Dict, Generator, Iterator, List, Tuple, Union import json +import numpy as np import torch -from transformers import PreTrainedTokenizer +from transformers import AutoConfig, PreTrainedModel, PreTrainedTokenizer from modelscope import (AutoModelForCausalLM, AutoTokenizer, Pipeline, snapshot_download) @@ -14,79 +15,84 @@ from modelscope.models.base import Model from modelscope.models.nlp import ChatGLM2Tokenizer, Llama2Tokenizer from modelscope.outputs import OutputKeys +from modelscope.pipelines.base import Input from modelscope.pipelines.builder import PIPELINES from modelscope.pipelines.util import is_model, is_official_hub_path from modelscope.utils.config import Config -from modelscope.utils.constant import Invoke, ModelFile, Tasks +from modelscope.utils.constant import Frameworks, Invoke, ModelFile, Tasks +from modelscope.utils.device import create_device, device_placement from modelscope.utils.logger import get_logger +from modelscope.utils.model_type_helper import ModelTypeHelper +from modelscope.utils.streaming_output import (PipelineStreamingOutputMixin, + StreamingOutputMixin, + add_stream_generate) logger = get_logger() +SWIFT_MODEL_ID_MAPPING = {} +SWIFT_FRAMEWORK = 'swift' -class ModelTypeHelper: - @staticmethod - def _get_file_name(model: str, cfg_name: str, - revision: Optional[str]) -> Optional[str]: - if osp.exists(model): - return osp.join(model, cfg_name) - try: - return model_file_download(model, cfg_name, revision=revision) - except Exception: - return None +class LLMAdapterRegistry: - @staticmethod - def _parse_and_get(file: Optional[str], pattern: str) -> Optional[str]: - if file is None or not osp.exists(file): - return None - return Config.from_file(file).safe_get(pattern) + llm_format_map = {'qwen': [None, None, None]} @classmethod - def _get(cls, model: str, revision: Optional[str]) -> Optional[str]: - cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) - hf_cfg_file = cls._get_file_name(model, ModelFile.CONFIG, revision) - cfg_model_type = cls._parse_and_get(cfg_file, 'model.type') - hf_cfg_model_type = cls._parse_and_get(hf_cfg_file, 'model_type') - return cfg_model_type or hf_cfg_model_type + def _add_to_map(cls, model_type: str, value_index: int = 0, member=None): + assert model_type or ModelTypeHelper.current_model_type + if model_type is None: + model_type = ModelTypeHelper.current_model_type + if model_type not in cls.llm_format_map: + cls.llm_format_map[model_type] = [None, None, None] + assert cls.llm_format_map[model_type][value_index] is None + cls.llm_format_map[model_type][value_index] = member + return member @classmethod - def _get_adapter(cls, model: str, - revision: Optional[str]) -> Optional[str]: - cfg_file = cls._get_file_name(model, ModelFile.CONFIGURATION, revision) - model = cls._parse_and_get(cfg_file, 'adapter_cfg.model_id_or_path') - revision = cls._parse_and_get(cfg_file, 'adapter_cfg.model_revision') - return None if model is None else cls._get(model, revision) + def _wrapper(cls, model_type: str, value_index: int = 0, member=None): + if member is not None: + return cls._add_to_map(model_type, value_index, member) + + def _register(member): + return cls._add_to_map(model_type, value_index, member) + + return _register @classmethod - def get(cls, - model: str, - revision: Optional[str] = None, - 
with_adapter: bool = False, - split: Optional[str] = None) -> Optional[str]: - model_type = cls._get(model, revision) - if model_type is None and with_adapter: - model_type = cls._get_adapter(model, revision) - if model_type is None: - return None - model_type = model_type.lower() - if split is None: - return model_type - return model_type.split(split)[0] + def register_format_messages(cls, model_type: str = None, function=None): + return cls._wrapper(model_type, 0, function) + + @classmethod + def register_format_output(cls, model_type: str = None, function=None): + return cls._wrapper(model_type, 1, function) + + @classmethod + def register_tokenizer(cls, model_type: str = None, tokenizer_class=None): + return cls._wrapper(model_type, 2, tokenizer_class) + + @classmethod + def contains(cls, model_name: str) -> bool: + return model_name in cls.llm_format_map + + @classmethod + def get(cls, model_name: str) -> bool: + return cls.llm_format_map[model_name] @PIPELINES.register_module(Tasks.chat, module_name='llm') @PIPELINES.register_module(Tasks.text_generation, module_name='llm') -class LLMPipeline(Pipeline): +class LLMPipeline(Pipeline, PipelineStreamingOutputMixin): def initiate_single_model(self, model): + from swift import Swift + if isinstance(model, str): logger.info(f'initiate model from {model}') if self._is_swift_model(model): if self.llm_framework is not None: logger.warning( - f'Cannot using swift with llm_framework, ignoring {self.llm_framework}.' + f'Cannot use swift with llm_framework, ignoring {self.llm_framework}.' ) - from swift import Swift base_model = self.cfg.safe_get('adapter_cfg.model_id_or_path') assert base_model is not None, 'Cannot get adapter_cfg.model_id_or_path from configuration.json file.' @@ -104,12 +110,20 @@ def initiate_single_model(self, model): if isinstance(model, str) and is_official_hub_path(model): logger.info(f'initiate model from location {model}.') - if self.llm_framework is not None: + if self.llm_framework: model_dir = model if os.path.exists( model) else snapshot_download(model) - return self._wrap_infer_framework(model_dir, - self.llm_framework) - elif is_model(model): + try: + model = self._wrap_infer_framework(model_dir, + self.llm_framework) + logger.info(f'initiate model with {self.llm_framework}.') + return model + except Exception as e: + logger.warning( + f'Cannot using llm_framework with {model}, ' + f'ignoring llm_framework={self.llm_framework} : {e}') + self.llm_framework = None + if is_model(model): return Model.from_pretrained( model, invoked_by=Invoke.PIPELINE, @@ -142,7 +156,8 @@ def _is_swift_model(self, model: Union[str, Any]) -> bool: return False self.cfg = Config.from_file(cfg_file) - return self.cfg.safe_get('adapter_cfg.tuner_backend') == 'swift' + return self.cfg.safe_get( + 'adapter_cfg.tuner_backend') == SWIFT_FRAMEWORK def _wrap_infer_framework(self, model_dir, framework='vllm'): from modelscope.pipelines.accelerate.base import InferFramework @@ -157,26 +172,40 @@ def __init__(self, **kwargs): self.device_map = kwargs.pop('device_map', None) self.llm_framework = llm_framework - # TODO: qwen-int4 need 'cuda'/'auto' device_map. 
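# Editor's note: a hedged sketch of how the new LLMAdapterRegistry replaces the old
# LLM_FORMAT_MAP dict. The model type 'my-model' and the formatter are hypothetical;
# the real registrations (chatglm2, llama2, qwen2, ...) appear further down this file.
from modelscope.pipelines.nlp.llm_pipeline import LLMAdapterRegistry


@LLMAdapterRegistry.register_format_messages('my-model')
def my_model_format_messages(messages, tokenizer, **kwargs):
    # Flatten the chat messages into a single prompt and tokenize it.
    prompt = '\n'.join(m['content'] for m in messages['messages'])
    return tokenizer(prompt, return_token_type_ids=False, return_tensors='pt')


assert LLMAdapterRegistry.contains('my-model')
format_messages, format_output, tokenizer_class = LLMAdapterRegistry.get('my-model')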
- if not self.device_map and 'qwen' in kwargs['model'].lower(): - self.device_map = 'cuda' + + if os.path.exists(kwargs['model']): + config = AutoConfig.from_pretrained( + kwargs['model'], trust_remote_code=True) + q_config = config.__dict__.get('quantization_config', None) + if q_config: + if q_config.get( + 'quant_method', + 'gptq') == 'gptq' and torch.cuda.device_count(): + self.device_map = 'cuda' + self.torch_dtype = kwargs.pop('torch_dtype', None) self.ignore_file_pattern = kwargs.pop('ignore_file_pattern', None) + + if llm_framework == SWIFT_FRAMEWORK: + self._init_swift(kwargs['model'], kwargs.get('device', 'gpu')) + return with self._temp_configuration_file(kwargs): super().__init__(*args, **kwargs) + if isinstance(self.model, PreTrainedModel): + self.model = add_stream_generate(self.model) tokenizer_class = None if isinstance(format_messages, str): - assert format_messages in LLM_FORMAT_MAP, \ + assert LLMAdapterRegistry.contains(format_messages), \ f'Can not find function for `{format_messages}`!' - format_messages, format_output, tokenizer_class = LLM_FORMAT_MAP[ - format_messages] + format_messages, format_output, tokenizer_class = \ + LLMAdapterRegistry.get(format_messages) if format_messages is None: model_type = ModelTypeHelper.get(self.model.model_dir, split='-') - if model_type in LLM_FORMAT_MAP: - format_messages, format_output, tokenizer_class = LLM_FORMAT_MAP[ - model_type] + if LLMAdapterRegistry.contains(model_type): + format_messages, format_output, tokenizer_class = \ + LLMAdapterRegistry.get(model_type) if format_messages is not None: self.format_messages = format_messages @@ -185,6 +214,73 @@ def __init__(self, self.tokenizer = self._get_tokenizer( tokenizer_class) if tokenizer is None else tokenizer + def _init_swift(self, model_id, device) -> None: + from swift.llm import prepare_model_template + from swift.llm.utils import MODEL_MAPPING, InferArguments + + global SWIFT_MODEL_ID_MAPPING + if not SWIFT_MODEL_ID_MAPPING: + SWIFT_MODEL_ID_MAPPING = { + v['model_id_or_path']: k + for k, v in MODEL_MAPPING.items() + } + + def format_messages(messages: Dict[str, List[Dict[str, str]]], + tokenizer: PreTrainedTokenizer, + **kwargs) -> Dict[str, torch.Tensor]: + inputs, _ = self.template.encode(get_example(messages)) + inputs.pop('labels', None) + if 'input_ids' in inputs: + input_ids = torch.tensor(inputs['input_ids'])[None] + inputs['input_ids'] = input_ids + token_len = input_ids.shape[1] + if 'inputs_embeds' in inputs: + inputs_embeds = inputs['inputs_embeds'][None] + inputs['inputs_embeds'] = inputs_embeds + token_len = inputs_embeds.shape[1] + inputs['attention_mask'] = torch.ones(token_len)[None] + if 'token_type_ids' in inputs: + inputs['token_type_ids'] = torch.tensor( + inputs['token_type_ids'])[None] + return inputs + + def get_example( + messages: Dict[str, List[Dict[str, str]]]) -> Dict[str, str]: + messages = messages['messages'] + assert len(messages) > 0, 'messages cannot be empty!' + system = None + if messages[0]['role'] == 'system': + system = messages[0]['content'] + messages = messages[1:] + assert len(messages) % 2 == 1, 'Unsupported messages format!' 
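+            # Odd length means alternating (user, assistant) turns plus the final
+            # user query: the last content becomes the prompt, earlier pairs the history.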
+ contents = [message['content'] for message in messages] + prompt = contents[-1] + history = list(zip(contents[::2], contents[1::2])) + if self.llm_framework == SWIFT_FRAMEWORK: + return dict(system=system, query=prompt, history=history) + else: + return dict(system=system, prompt=prompt, history=history) + + assert model_id in SWIFT_MODEL_ID_MAPPING,\ + f'Invalid model id {model_id} or Swift framework does not support this model.' + args = InferArguments(model_type=SWIFT_MODEL_ID_MAPPING[model_id]) + model, template = prepare_model_template( + args, device_map=self.device_map) + self.model = add_stream_generate(model) + template.model = self.model + self.template = template + self.tokenizer = template.tokenizer + self.format_messages = format_messages + + self.has_multiple_models = False + self.framework = Frameworks.torch + self.device_name = device + self.device = create_device(device) + self._model_prepare = False + self._model_prepare_lock = Lock() + self._auto_collate = True + self._compile = False + @contextmanager def _temp_configuration_file(self, kwargs: Dict[str, Any]): kwargs['model'] = model = self.initiate_single_model(kwargs['model']) @@ -203,10 +299,11 @@ def _process_single(self, inputs, *args, **kwargs) -> Dict[str, Any]: forward_params = kwargs.get('forward_params', {}) postprocess_params = kwargs.get('postprocess_params', {}) - is_messages = isinstance(inputs, dict) and 'messages' in inputs - tokens = self.preprocess(inputs, is_messages, **preprocess_params) + preprocess_params['is_messages'] = postprocess_params['is_messages'] \ + = isinstance(inputs, dict) and 'messages' in inputs + tokens = self.preprocess(inputs, **preprocess_params) - if self.llm_framework is None: + if self.llm_framework in (None, SWIFT_FRAMEWORK): # pytorch model if hasattr(self.model, 'generate'): outputs = self.model.generate(**tokens, **forward_params) @@ -219,14 +316,65 @@ def _process_single(self, inputs, *args, **kwargs) -> Dict[str, Any]: tokens = [list(tokens['inputs'].flatten().numpy())] outputs = self.model(tokens, **forward_params)[0] - if self.llm_framework is None: + if self.llm_framework in (None, SWIFT_FRAMEWORK): # pytorch model outputs = outputs.tolist()[0][len(tokens['inputs'][0]):] - response = self.postprocess(outputs, is_messages, **postprocess_params) + response = self.postprocess(outputs, **postprocess_params) return response - def preprocess(self, inputs: Union[str, Dict], is_messages: bool, - **kwargs): + def stream_generate(self, inputs: Union[Input, List[Input]], *args, + **kwargs) -> Generator: + assert isinstance(self.model, StreamingOutputMixin + ), 'pipeline.model must be StreamingOutputMixin!' 
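+        # Lazily prepare (device-place) the model on the first streaming call,
+        # mirroring the non-streaming Pipeline.__call__ path.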
+ if (self.model or (self.has_multiple_models and self.models[0])): + if not self._model_prepare: + self.prepare_model() + + preprocess_params, forward_params, postprocess_params = self._sanitize_parameters( + **kwargs) + preprocess_params['is_messages'] = postprocess_params['is_messages'] \ + = isinstance(inputs, dict) and 'messages' in inputs + + if isinstance(inputs, list): + model_input_list = [ + self._preprocess_with_check(i, preprocess_params) + for i in inputs + ] + output = [] + for ele in model_input_list: + output.append( + self._stream_single(ele, forward_params, + postprocess_params)) + else: + model_input = self._preprocess_with_check(inputs, + preprocess_params) + output = self._stream_single(model_input, forward_params, + postprocess_params) + return output + + def _stream_single(self, model_input: Dict[str, Any], + forward_params: Dict[str, Any], + postprocess_params: Dict[str, Any]) -> Generator: + + with device_placement(self.framework, self.device_name): + if self.framework == Frameworks.torch: + with torch.no_grad(): + if self._auto_collate: + model_input = self._collate_fn(model_input) + stream = self.model.stream_generate( + **model_input, **forward_params) + else: + stream = self.model.stream_generate(**model_input, + **forward_params) + + for out in stream: + out = out.tolist()[0][len(model_input['inputs'][0]):] + out = self.postprocess(out, **postprocess_params) + self._check_output(out) + yield out + + def preprocess(self, inputs: Union[str, Dict], **kwargs): + is_messages = kwargs.pop('is_messages') if is_messages: tokens = self.format_messages(inputs, self.tokenizer, **kwargs) else: @@ -244,13 +392,16 @@ def preprocess(self, inputs: Union[str, Dict], is_messages: bool, else: raise ValueError('model does not have `device` attribute!') return { - k: (v.to(device) if isinstance(v, torch.Tensor) else v) + k: (v.to(device) if torch.is_tensor(v) else v) for k, v in tokens.items() } - def postprocess(self, outputs, is_messages: bool, **kwargs): - + def postprocess(self, outputs, **kwargs): + is_messages = kwargs.pop('is_messages') if not isinstance(outputs, str): + shape_type = (torch.Tensor, np.ndarray) + if isinstance(outputs, shape_type) and len(outputs.shape) > 1: + outputs = outputs[0] response = self.tokenizer.decode( outputs, skip_special_tokens=True, **kwargs) else: @@ -356,6 +507,7 @@ def _concat(ids: List[int], *args: Union[int, List[int]]) -> List[int]: return ids +@LLMAdapterRegistry.register_format_messages('chatglm2') def chatglm2_format_messages(messages, tokenizer, **kwargs): def build_chatglm2_prompt(messages, **kwargs): @@ -376,6 +528,8 @@ def build_chatglm2_prompt(messages, **kwargs): return tokenizer(prompt, return_token_type_ids=False, return_tensors='pt') +@LLMAdapterRegistry.register_format_output('chatglm') +@LLMAdapterRegistry.register_format_output('chatglm2') def chatglm2_format_output(response, **kwargs): response = response.strip() response = response.replace('[[训练时间]]', '2023年') @@ -386,6 +540,8 @@ def chatglm2_format_output(response, **kwargs): return outputs +@LLMAdapterRegistry.register_format_messages('llama') +@LLMAdapterRegistry.register_format_messages('llama2') def llama2_format_messages(messages, tokenizer, **kwargs): from transformers import BatchEncoding @@ -437,6 +593,8 @@ def build_llama2_prompt(messages, tokenizer, **kwargs): return BatchEncoding({'input_ids': tokens}) +@LLMAdapterRegistry.register_format_messages('baichuan') +@LLMAdapterRegistry.register_format_messages('baichuan2') def baichuan_format_messages(messages, 
tokenizer, **kwargs): from transformers import BatchEncoding @@ -490,6 +648,7 @@ def _parse_messages(messages, split_role='user'): return BatchEncoding({'input_ids': input_tokens}) +@LLMAdapterRegistry.register_format_messages('wizardlm') def wizardlm_format_messages(messages, tokenizer, **kwargs): def build_wizardlm_prompt(messages, tokenizer, **kwargs): @@ -520,6 +679,7 @@ def build_wizardlm_prompt(messages, tokenizer, **kwargs): return tokenizer(prompts, return_token_type_ids=False, return_tensors='pt') +@LLMAdapterRegistry.register_format_messages('wizardcode') def wizardcode_format_messages(messages, tokenizer, **kwargs): messages = messages['messages'] assert len(messages) == 2, 'wizard code only support two messages.' @@ -542,6 +702,7 @@ def wizardcode_format_messages(messages, tokenizer, **kwargs): return inputs +@LLMAdapterRegistry.register_format_messages('chatglm') def chatglm3_format_messages(messages, tokenizer, **kwargs): messages = messages['messages'] query, history = messages[-1]['content'], messages[:-1] @@ -555,15 +716,14 @@ def chatglm3_format_messages(messages, tokenizer, **kwargs): return inputs -LLM_FORMAT_MAP = { - 'chatglm2': - (chatglm2_format_messages, chatglm2_format_output, ChatGLM2Tokenizer), - 'qwen': (LLMPipeline.format_messages, LLMPipeline.format_output, None), - 'llama2': (llama2_format_messages, None, Llama2Tokenizer), - 'llama': (llama2_format_messages, None, Llama2Tokenizer), - 'baichuan': (baichuan_format_messages, None, None), - 'baichuan2': (baichuan_format_messages, None, None), - 'wizardlm': (wizardlm_format_messages, None, None), - 'wizardcode': (wizardcode_format_messages, None, None), - 'chatglm': (chatglm3_format_messages, chatglm2_format_output, None), -} +@LLMAdapterRegistry.register_format_messages('qwen2') +def qwen2_format_messages(messages, tokenizer, **kwargs): + messages = messages['messages'] + text = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True) + return tokenizer([text], return_tensors='pt') + + +LLMAdapterRegistry.register_tokenizer('chatglm2', ChatGLM2Tokenizer) +LLMAdapterRegistry.register_tokenizer('llama', Llama2Tokenizer) +LLMAdapterRegistry.register_tokenizer('llama2', Llama2Tokenizer) diff --git a/modelscope/pipelines/nlp/table_question_answering_pipeline.py b/modelscope/pipelines/nlp/table_question_answering_pipeline.py index 7c064f579..7e281a0ad 100644 --- a/modelscope/pipelines/nlp/table_question_answering_pipeline.py +++ b/modelscope/pipelines/nlp/table_question_answering_pipeline.py @@ -62,7 +62,7 @@ def __init__(self, self.preprocessor = TableQuestionAnsweringPreprocessor( self.model.model_dir, **kwargs) - # initilize tokenizer + # initialize tokenizer self.tokenizer = BertTokenizer( os.path.join(self.model.model_dir, ModelFile.VOCAB_FILE)) diff --git a/modelscope/pipelines/nlp/text_error_correction_pipeline.py b/modelscope/pipelines/nlp/text_error_correction_pipeline.py index dc4bc40a4..9fa5a2a8c 100644 --- a/modelscope/pipelines/nlp/text_error_correction_pipeline.py +++ b/modelscope/pipelines/nlp/text_error_correction_pipeline.py @@ -80,7 +80,8 @@ def postprocess(self, inputs: Dict[str, Tensor], sc_tensor = inputs['predictions'] if isinstance(sc_tensor, list): - sc_tensor = sc_tensor[0] + if isinstance(sc_tensor[0], list): + sc_tensor = sc_tensor[0] sc_sent = self.vocab.string( sc_tensor, extra_symbols_to_ignore={self.vocab.pad()}) sc_sent = (sc_sent + ' ').replace('##', '').rstrip() diff --git a/modelscope/pipelines/nlp/text_generation_pipeline.py 
b/modelscope/pipelines/nlp/text_generation_pipeline.py index 1015d3112..55eaf8091 100644 --- a/modelscope/pipelines/nlp/text_generation_pipeline.py +++ b/modelscope/pipelines/nlp/text_generation_pipeline.py @@ -18,9 +18,12 @@ from modelscope.utils.chinese_utils import remove_space_between_chinese_chars from modelscope.utils.constant import ModelFile, Tasks from modelscope.utils.hub import Config, read_config +from modelscope.utils.logger import get_logger from modelscope.utils.streaming_output import PipelineStreamingOutputMixin from modelscope.utils.torch_utils import is_on_same_device +logger = get_logger() + __all__ = [ 'TextGenerationPipeline', 'TextGenerationT5Pipeline', 'ChatGLM6bTextGenerationPipeline', 'ChatGLM6bV2TextGenerationPipeline', @@ -86,14 +89,24 @@ def __init__(self, self.postprocessor = cfg.get('postprocessor') if self.postprocessor is None: self.postprocessor = 'decode' + self.has_logged = False def _sanitize_parameters(self, **pipeline_parameters): return {}, pipeline_parameters, {} - def forward(self, inputs: Dict[str, Any], + def forward(self, inputs: Union[Dict[str, Any], Tensor], **forward_params) -> Dict[str, Any]: with torch.no_grad(): - return self.model.generate(inputs, **forward_params) + try: + return self.model.generate(inputs, **forward_params) + except AttributeError as e: + if not self.has_logged: + logger.warning( + 'When inputs are passed directly, ' + f'the error is {e}, ' + 'which can be ignored if it runs correctly.') + self.has_logged = True + return self.model.generate(**inputs, **forward_params) def decode(self, inputs) -> str: return self.preprocessor.decode( @@ -451,7 +464,7 @@ def forward(self, prompt: str, **forward_params) -> Dict[str, Any]: padding=True, truncation=True, max_length=1024) - input_ids = input_ids.input_ids.cuda() + input_ids = input_ids.input_ids.to(self.model.device) outputs = self.model.generate( input_ids, num_beams=4, do_sample=False, max_new_tokens=256) decoded_sentences = self.tokenizer.batch_decode( diff --git a/modelscope/pipelines/nlp/translation_pipeline.py b/modelscope/pipelines/nlp/translation_pipeline.py index 8750cd3bf..7e1dfd057 100644 --- a/modelscope/pipelines/nlp/translation_pipeline.py +++ b/modelscope/pipelines/nlp/translation_pipeline.py @@ -51,14 +51,12 @@ def __init__(self, model: Model, **kwargs): self._src_vocab_path = osp.join( model, self.cfg['dataset']['src_vocab']['file']) - self._src_vocab = dict([ - (w.strip(), i) for i, w in enumerate(open(self._src_vocab_path)) - ]) + self._src_vocab = dict([(w.strip(), i) for i, w in enumerate( + open(self._src_vocab_path, encoding='utf-8'))]) self._trg_vocab_path = osp.join( model, self.cfg['dataset']['trg_vocab']['file']) - self._trg_rvocab = dict([ - (i, w.strip()) for i, w in enumerate(open(self._trg_vocab_path)) - ]) + self._trg_rvocab = dict([(i, w.strip()) for i, w in enumerate( + open(self._trg_vocab_path, encoding='utf-8'))]) tf_config = tf.ConfigProto(allow_soft_placement=True) tf_config.gpu_options.allow_growth = True @@ -81,7 +79,7 @@ def __init__(self, model: Model, **kwargs): self._tok = MosesTokenizer(lang=self._src_lang) self._detok = MosesDetokenizer(lang=self._tgt_lang) - self._bpe = apply_bpe.BPE(open(self._src_bpe_path)) + self._bpe = apply_bpe.BPE(open(self._src_bpe_path, encoding='utf-8')) # model output = self.model(self.input_wids) diff --git a/modelscope/preprocessors/ofa/asr.py b/modelscope/preprocessors/ofa/asr.py index 5d36b829c..da953299d 100644 --- a/modelscope/preprocessors/ofa/asr.py +++ b/modelscope/preprocessors/ofa/asr.py @@ 
-56,7 +56,7 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = random.choice([0.9, 1.0, 1.1]) audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) - wav, sr = librosa.load(audio_bytes, 16000, mono=True) + wav, sr = librosa.load(audio_bytes, sr=16000, mono=True) fbank = self.prepare_fbank( torch.tensor([wav], dtype=torch.float32), sr, @@ -94,7 +94,7 @@ def _build_train_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: def _build_infer_sample(self, data: Dict[str, Any]) -> Dict[str, Any]: speed = 1.0 audio_bytes = self.get_audio_bytes(data[self.column_map['wav']]) - wav, sr = librosa.load(audio_bytes, 16000, mono=True) + wav, sr = librosa.load(audio_bytes, sr=16000, mono=True) fbank = self.prepare_fbank( torch.tensor([wav], dtype=torch.float32), sr, diff --git a/modelscope/preprocessors/templates/__init__.py b/modelscope/preprocessors/templates/__init__.py new file mode 100644 index 000000000..5ac1780df --- /dev/null +++ b/modelscope/preprocessors/templates/__init__.py @@ -0,0 +1,2 @@ +from .base import Template, get_template +from .template import TemplateType diff --git a/modelscope/preprocessors/templates/base.py b/modelscope/preprocessors/templates/base.py new file mode 100644 index 000000000..4504a4bc7 --- /dev/null +++ b/modelscope/preprocessors/templates/base.py @@ -0,0 +1,1041 @@ +# Copyright (c) Alibaba, Inc. and its affiliates. +import json +import re +from copy import deepcopy +from typing import Any, Dict, List, Literal, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from modelscope import get_logger +from torch.nn import Module +from torch.nn.utils.rnn import pad_sequence +from transformers import PreTrainedTokenizerBase, StoppingCriteria +from .loss_scale import loss_scale_map +from .tools_prompt import get_tools_prompt +from .utils import load_batch, load_image, rescale_image, fetch_one, to_device, decode_base64 +from .utils import History, Prompt, StopWords, Context, Messages + +logger = get_logger() + +DEFAULT_SYSTEM = 'You are a helpful assistant.' 
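# Editor's note: a hedged sketch of the package entry point defined just below.
# get_template() looks the template name up in TEMPLATE_MAPPING, deep-copies the
# registered Template and binds the tokenizer to it; 'qwen' is an assumed key here,
# and the model id is illustrative only.
from modelscope import AutoTokenizer
from modelscope.preprocessors.templates import get_template

tokenizer = AutoTokenizer.from_pretrained(
    'qwen/Qwen2-7B-Instruct', trust_remote_code=True)
template = get_template('qwen', tokenizer, max_length=2048)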
+ +TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {} + + +def get_template( + template_type: str, + tokenizer: PreTrainedTokenizerBase, + default_system: Optional[str] = None, + max_length: Optional[int] = None, + truncation_strategy: Literal['delete', 'truncation_left'] = 'delete', + **kwargs, +) -> 'Template': + template_info = TEMPLATE_MAPPING[template_type] + template = deepcopy(template_info['template']) + template.init_template(tokenizer, default_system, max_length, truncation_strategy, **kwargs) + return template + + +def _findall(token_list: List[int], sub_token_list: Union[int, List[int]]) -> List[int]: + """Find the index of a token in the token_list.""" + if isinstance(sub_token_list, int): + sub_token_list = [sub_token_list] + res = [] + idx = -1 + try: + while True: + idx = token_list.index(sub_token_list[0], idx + 1) + if len(sub_token_list) == 1 or sub_token_list == token_list[idx:idx + len(sub_token_list)]: + res.append(idx) + except ValueError: + pass + return res + + +def replace_img_tag(messages: Messages, + replace_token: str, + pattern=r'(.+?)') -> Tuple[str, History, List[str]]: + images_path = [] + new_messages = [] + for i, m in enumerate(messages): + m = m.copy() + if m['content'] is None or m['role'] in ('tool', 'system', 'assistant'): + new_messages.append(m) + else: + images_path += re.findall(pattern, m['content']) + m['content'] = re.sub(pattern, replace_token, m['content']) + new_messages.append(m) + return messages, images_path + + +class StopWordsCriteria(StoppingCriteria): + """Adding extra stop words in template to prevent unstoppable generation + Like suffixes and chat seps in the template. + """ + def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_words: StopWords, **tokenizer_kwargs) -> None: + self.tokenizer = tokenizer + self.stop_words = stop_words + self.tokenizer_kwargs = tokenizer_kwargs + self.start_idx = -1 + + def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, **kwargs) -> bool: + if self.start_idx == -1: + self.start_idx = len(input_ids[0]) - 1 + tokenizer = self.tokenizer + stop_words = self.stop_words + # [-20:]: Assuming the end tokens do not exceed 20 tokens, + # to avoid input_ids being too long and affecting efficiency. + text = tokenizer.decode(input_ids[0, self.start_idx:][-20:], **self.tokenizer_kwargs) + for stop_word in stop_words: + if isinstance(stop_word, str): + if stop_word in text: + return True + else: # list + if len(stop_word) > 0 and input_ids[0].tolist()[-len(stop_word):] == stop_word: + return True + return False + + +class Template: + """A template class for all supported models. + + Args: + prefix: Prefix tokens before the first turn's prompt + prompt: A list of elements whose types are str and list of integers. The input query part of every turn. + chat_sep: The chat separators between every turn. + suffix: The end tokens after the chat finished. + default_system: A default system instruction. + system_prefix: The prefix if the `system` is not empty. + auto_add_bos: By default, the bos_token is not added. The auto_add_bos option will determine + whether to add it based on `tokenizer.encode('')`. 
+ tools_prompt: The tools prompt name + tool_prompt: The tool prompt, usually useful when there is a tool role + padding_side: The padding side + infer_media_type: The media type supported by the multi-modals + Examples: + system\nYou are a helpful assistant!\nWho are you?\nassistant:I am a robot\nWho are you?\nassistant:I am a robot # noqa + ----------system------------ ---query---- --response- -----chatsep----- ---query--- --response- ----suffix----- + ----------------------------system_prefix---------------------------- ---------------------------- prompt ------------------------------------- ---------------------------- prompt ------------------------------------- + + """ + + special_tokens = ['', '