From 4b1771c89d305a102007c35026991f16fe3d14ad Mon Sep 17 00:00:00 2001 From: walsvid Date: Thu, 14 Jul 2022 17:24:57 +0800 Subject: [PATCH] add single depth to 3d hand keypoints, add nyu hand dataset and awr network --- configs/_base_/datasets/nyu.py | 92 ++++ .../awr/nyu/res50_nyu_all_128x128.py | 177 +++++++ mmpose/core/evaluation/top_down_eval.py | 35 ++ mmpose/datasets/datasets/base/__init__.py | 12 +- ...kpt_3d_sview_depth_img_top_down_dataset.py | 400 +++++++++++++++ mmpose/datasets/datasets/hand/__init__.py | 3 +- .../datasets/datasets/hand/nyuhand_dataset.py | 168 +++++++ mmpose/datasets/pipelines/hand_transform.py | 87 ++++ .../datasets/pipelines/top_down_transform.py | 45 +- mmpose/models/backbones/__init__.py | 4 +- mmpose/models/backbones/awr_resnet.py | 33 ++ mmpose/models/detectors/__init__.py | 4 +- mmpose/models/detectors/depthhand_3d.py | 303 ++++++++++++ mmpose/models/heads/__init__.py | 3 +- mmpose/models/heads/awr_head.py | 458 ++++++++++++++++++ mmpose/models/losses/__init__.py | 9 +- mmpose/models/losses/regression_loss.py | 43 ++ tests/test_models/test_awr_3d_head.py | 91 ++++ .../test_models/test_depthhand_3d_forward.py | 108 +++++ 19 files changed, 2060 insertions(+), 15 deletions(-) create mode 100644 configs/_base_/datasets/nyu.py create mode 100644 configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py create mode 100644 mmpose/datasets/datasets/base/kpt_3d_sview_depth_img_top_down_dataset.py create mode 100644 mmpose/datasets/datasets/hand/nyuhand_dataset.py create mode 100644 mmpose/models/backbones/awr_resnet.py create mode 100644 mmpose/models/detectors/depthhand_3d.py create mode 100644 mmpose/models/heads/awr_head.py create mode 100644 tests/test_models/test_awr_3d_head.py create mode 100644 tests/test_models/test_depthhand_3d_forward.py diff --git a/configs/_base_/datasets/nyu.py b/configs/_base_/datasets/nyu.py new file mode 100644 index 0000000000..cf1acb0a87 --- /dev/null +++ b/configs/_base_/datasets/nyu.py @@ -0,0 +1,92 @@ +dataset_info = dict( + dataset_name='nyu', + paper_info=dict( + author='Jonathan Tompson and Murphy Stein and Yann Lecun and ' + 'Ken Perlin', + title='Real-Time Continuous Pose Recovery of Human Hands ' + 'Using Convolutional Networks', + container='ACM Transactions on Graphics', + year='2014', + homepage='https://jonathantompson.github.io/NYU_Hand_Pose_Dataset.htm', + ), + keypoint_info={ + 0: dict(name='F1_KNU3_A', id=0, color=[255, 128, 0], type='', swap=''), + 1: dict(name='F1_KNU3_B', id=1, color=[255, 128, 0], type='', swap=''), + 2: dict(name='F1_KNU2_A', id=2, color=[255, 128, 0], type='', swap=''), + 3: dict(name='F1_KNU2_B', id=3, color=[255, 128, 0], type='', swap=''), + 4: + dict(name='F1_KNU1_A', id=4, color=[255, 153, 255], type='', swap=''), + 5: + dict(name='F1_KNU1_B', id=5, color=[255, 153, 255], type='', swap=''), + 6: + dict(name='F2_KNU3_A', id=6, color=[255, 153, 255], type='', swap=''), + 7: + dict(name='F2_KNU3_B', id=7, color=[255, 153, 255], type='', swap=''), + 8: + dict(name='F2_KNU2_A', id=8, color=[102, 178, 255], type='', swap=''), + 9: + dict(name='F2_KNU2_B', id=9, color=[102, 178, 255], type='', swap=''), + 10: + dict(name='F2_KNU1_A', id=10, color=[102, 178, 255], type='', swap=''), + 11: + dict(name='F2_KNU1_B', id=11, color=[102, 178, 255], type='', swap=''), + 12: + dict(name='F3_KNU3_A', id=12, color=[255, 51, 51], type='', swap=''), + 13: + dict(name='F3_KNU3_B', id=13, color=[255, 51, 51], type='', swap=''), + 14: + dict(name='F3_KNU2_A', id=14, color=[255, 51, 51], type='', 
swap=''), + 15: + dict(name='F3_KNU2_B', id=15, color=[255, 51, 51], type='', swap=''), + 16: dict(name='F3_KNU1_A', id=16, color=[0, 255, 0], type='', swap=''), + 17: dict(name='F3_KNU1_B', id=17, color=[0, 255, 0], type='', swap=''), + 18: dict(name='F4_KNU3_A', id=18, color=[0, 255, 0], type='', swap=''), + 19: dict(name='F4_KNU3_B', id=19, color=[0, 255, 0], type='', swap=''), + 20: + dict(name='F4_KNU2_A', id=20, color=[255, 255, 255], type='', swap=''), + 21: + dict(name='F4_KNU2_B', id=21, color=[255, 128, 0], type='', swap=''), + 22: + dict(name='F4_KNU1_A', id=22, color=[255, 128, 0], type='', swap=''), + 23: + dict(name='F4_KNU1_B', id=23, color=[255, 128, 0], type='', swap=''), + 24: + dict(name='TH_KNU3_A', id=24, color=[255, 128, 0], type='', swap=''), + 25: + dict(name='TH_KNU3_B', id=25, color=[255, 153, 255], type='', swap=''), + 26: + dict(name='TH_KNU2_A', id=26, color=[255, 153, 255], type='', swap=''), + 27: + dict(name='TH_KNU2_B', id=27, color=[255, 153, 255], type='', swap=''), + 28: + dict(name='TH_KNU1_A', id=28, color=[255, 153, 255], type='', swap=''), + 29: + dict(name='TH_KNU1_B', id=29, color=[102, 178, 255], type='', swap=''), + 30: + dict(name='PALM_1', id=30, color=[102, 178, 255], type='', swap=''), + 31: + dict(name='PALM_2', id=31, color=[102, 178, 255], type='', swap=''), + 32: + dict(name='PALM_3', id=32, color=[102, 178, 255], type='', swap=''), + 33: dict(name='PALM_4', id=33, color=[255, 51, 51], type='', swap=''), + 34: dict(name='PALM_5', id=34, color=[255, 51, 51], type='', swap=''), + 35: dict(name='PALM_6', id=35, color=[255, 51, 51], type='', swap=''), + }, + skeleton_info={ + 0: dict(link=('PALM_3', 'F1_KNU2_B'), id=0, color=[255, 128, 0]), + 1: dict(link=('F1_KNU2_B', 'F1_KNU3_A'), id=1, color=[255, 128, 0]), + 2: dict(link=('PALM_3', 'F2_KNU2_B'), id=2, color=[255, 128, 0]), + 3: dict(link=('F2_KNU2_B', 'F2_KNU3_A'), id=3, color=[255, 128, 0]), + 4: dict(link=('PALM_3', 'F3_KNU2_B'), id=4, color=[255, 153, 255]), + 5: dict(link=('F3_KNU2_B', 'F3_KNU3_A'), id=5, color=[255, 153, 255]), + 6: dict(link=('PALM_3', 'F4_KNU2_B'), id=6, color=[255, 153, 255]), + 7: dict(link=('F4_KNU2_B', 'F4_KNU3_A'), id=7, color=[255, 153, 255]), + 8: dict(link=('PALM_3', 'TH_KNU2_B'), id=8, color=[102, 178, 255]), + 9: dict(link=('TH_KNU2_B', 'TH_KNU3_B'), id=9, color=[102, 178, 255]), + 10: + dict(link=('TH_KNU3_B', 'TH_KNU3_A'), id=10, color=[102, 178, 255]), + 11: dict(link=('PALM_3', 'PALM_1'), id=11, color=[102, 178, 255]), + 12: dict(link=('PALM_3', 'PALM_2'), id=12, color=[255, 51, 51]), + }, + joint_weights=[1.] 
* 36, + sigmas=[]) diff --git a/configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py b/configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py new file mode 100644 index 0000000000..a6d78345ae --- /dev/null +++ b/configs/hand/3d_kpt_sview_depth_img/awr/nyu/res50_nyu_all_128x128.py @@ -0,0 +1,177 @@ +_base_ = [ + '../../../../_base_/default_runtime.py', + '../../../../_base_/datasets/nyu.py' +] +checkpoint_config = dict(interval=1) +# TODO: metric +evaluation = dict( + interval=1, + metric=['MRRPE', 'MPJPE', 'Handedness_acc'], + save_best='MPJPE_all') + +optimizer = dict( + type='Adam', + lr=2e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict(policy='step', step=[15, 17]) +total_epochs = 20 +log_config = dict( + interval=20, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +load_from = '/root/mmpose/data/ckpt/new_res50.pth' +used_keypoints_index = [0, 3, 6, 9, 12, 15, 18, 21, 24, 25, 27, 30, 31, 32] + +channel_cfg = dict( + num_output_channels=14, + dataset_joints=36, + dataset_channel=used_keypoints_index, + inference_channel=used_keypoints_index) + +# model settings +model = dict( + type='Depthhand3D', # pretrained=None + backbone=dict( + type='AWRResNet', + depth=50, + frozen_stages=-1, + zero_init_residual=False, + in_channels=1), + keypoint_head=dict( + type='AdaptiveWeightingRegression3DHead', + offset_head_cfg=dict( + in_channels=256, + out_channels_vector=42, + out_channels_scalar=14, + heatmap_kernel_size=1.0, + ), + deconv_head_cfg=dict( + in_channels=2048, + out_channels=256, + depth_size=64, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=dict(final_conv_kernel=0, )), + loss_offset=dict(type='AWRSmoothL1Loss', use_target_weight=False), + loss_keypoint=dict(type='AWRSmoothL1Loss', use_target_weight=True), + ), + train_cfg=dict(use_img_for_head=True), + test_cfg=dict(use_img_for_head=True, flip_test=False)) + +data_cfg = dict( + image_size=[128, 128], + heatmap_size=[64, 64, 56], + cube_size=[300, 300, 300], + heatmap_size_root=64, + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel']) + +train_pipeline = [ + dict(type='LoadImageFromFile', color_type='unchanged'), + dict(type='TopDownGetBboxCenterScale', padding=1.0), + dict(type='TopDownAffine'), + dict(type='DepthToTensor'), + dict( + type='MultitaskGatherTarget', + pipeline_list=[ + [ + dict( + type='TopDownGenerateTargetRegression', + use_zero_mean=True, + joint_indices=used_keypoints_index, + is_3d=True, + normalize_depth=True, + ), + dict( + type='HandGenerateJointToOffset', + heatmap_kernel_size=1.0, + ) + ], + [ + dict( + type='TopDownGenerateTargetRegression', + use_zero_mean=True, + joint_indices=used_keypoints_index, + is_3d=True, + normalize_depth=True, + ) + ], + ], + pipeline_indices=[0, 1], + ), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs', 'cube_size', 'center_depth', 'focal', + 'princpt', 'image_size', 'joints_cam', 'dataset_channel', + 'joints_uvd' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile', color_type='unchanged'), + dict(type='TopDownGetBboxCenterScale', padding=1.0), + dict(type='TopDownAffine'), + dict(type='DepthToTensor'), + dict( + type='Collect', + 
keys=['img'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs', 'cube_size', 'center_depth', 'focal', + 'princpt', 'image_size', 'joints_cam', 'dataset_channel', + 'joints_uvd' + ]) +] + +test_pipeline = val_pipeline + +data_root = 'data/nyu' +data = dict( + samples_per_gpu=4, + workers_per_gpu=0, + shuffle=False, + train=dict( + type='NYUHandDataset', + ann_file=f'{data_root}/annotations/nyu_test_data.json', + camera_file=f'{data_root}/annotations/nyu_test_camera.json', + joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json', + img_prefix=f'{data_root}/images/test/', + data_cfg=data_cfg, + use_refined_center=False, + align_uvd_xyz_direction=True, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='NYUHandDataset', + ann_file=f'{data_root}/annotations/nyu_test_data.json', + camera_file=f'{data_root}/annotations/nyu_test_camera.json', + joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json', + img_prefix=f'{data_root}/images/test/', + data_cfg=data_cfg, + use_refined_center=False, + align_uvd_xyz_direction=True, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='NYUHandDataset', + ann_file=f'{data_root}/annotations/nyu_test_data.json', + camera_file=f'{data_root}/annotations/nyu_test_camera.json', + joint_file=f'{data_root}/annotations/nyu_test_joint_3d.json', + img_prefix=f'{data_root}/images/test/', + data_cfg=data_cfg, + use_refined_center=False, + align_uvd_xyz_direction=True, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/mmpose/core/evaluation/top_down_eval.py b/mmpose/core/evaluation/top_down_eval.py index ee6a2501cf..21c6b8c6b3 100644 --- a/mmpose/core/evaluation/top_down_eval.py +++ b/mmpose/core/evaluation/top_down_eval.py @@ -655,6 +655,41 @@ def keypoints_from_heatmaps3d(heatmaps, center, scale): return preds, maxvals +def keypoints_from_joint_uvd(joint_uvd, center, scale, image_size): + """Get final keypoint predictions from 3d heatmaps and transform them back + to the image. + + Note: + - batch size: N + - num keypoints: K + - heatmap depth size: D + - heatmap height: H + - heatmap width: W + + Args: + heatmaps (np.ndarray[N, K, D, H, W]): model predicted heatmaps. + center (np.ndarray[N, 2]): Center of the bounding box (x, y). + scale (np.ndarray[N, 2]): Scale of the bounding box + wrt height/width. + + Returns: + tuple: A tuple containing keypoint predictions and scores. + + - preds (np.ndarray[N, K, 3]): Predicted 3d keypoint location \ + in images. + - maxvals (np.ndarray[N, K, 1]): Scores (confidence) of the keypoints. + """ + N, K, D = joint_uvd.shape + preds = joint_uvd + maxvals = np.ones((N, K, 1), dtype=np.float32) + # Transform back to the image + for i in range(N): + preds[i, :, :2] = transform_preds( + (preds[i, :, :2] + 1) * image_size[i] / 2, center[i], scale[i], + [image_size[i, 1], image_size[i, 0]]) + return preds, maxvals + + def multilabel_classification_accuracy(pred, gt, mask, thr=0.5): """Get multi-label classification accuracy. 
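Note on the new `keypoints_from_joint_uvd` helper added above: the head regresses joint coordinates whose x/y components are normalized to [-1, 1] over the network input, and the helper maps them back to the original image frame via `transform_preds`, leaving the (already normalized) depth component untouched. A minimal usage sketch with dummy inputs (shapes and values are assumed for illustration, not taken from the patch):

import numpy as np
from mmpose.core.evaluation.top_down_eval import keypoints_from_joint_uvd

N, K = 2, 14                                                   # batch size, number of keypoints
joint_uvd = np.random.uniform(-1, 1, (N, K, 3)).astype(np.float32)   # normalized (u, v, d) predictions
center = np.tile([320., 240.], (N, 1)).astype(np.float32)      # bbox centers in the full image
scale = np.full((N, 2), 1.5, dtype=np.float32)                 # bbox size / 200.0
image_size = np.tile([128., 128.], (N, 1)).astype(np.float32)  # network input size per sample

preds, maxvals = keypoints_from_joint_uvd(joint_uvd, center, scale, image_size)
assert preds.shape == (N, K, 3) and maxvals.shape == (N, K, 1)  # scores are all ones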
diff --git a/mmpose/datasets/datasets/base/__init__.py b/mmpose/datasets/datasets/base/__init__.py index e5f9a0899c..75d6901f2a 100644 --- a/mmpose/datasets/datasets/base/__init__.py +++ b/mmpose/datasets/datasets/base/__init__.py @@ -6,12 +6,18 @@ from .kpt_2d_sview_rgb_vid_top_down_dataset import \ Kpt2dSviewRgbVidTopDownDataset from .kpt_3d_mview_rgb_img_direct_dataset import Kpt3dMviewRgbImgDirectDataset +from .kpt_3d_sview_depth_img_top_down_dataset import \ + Kpt3dSviewDepthImgTopDownDataset from .kpt_3d_sview_kpt_2d_dataset import Kpt3dSviewKpt2dDataset from .kpt_3d_sview_rgb_img_top_down_dataset import \ Kpt3dSviewRgbImgTopDownDataset __all__ = [ - 'Kpt3dMviewRgbImgDirectDataset', 'Kpt2dSviewRgbImgTopDownDataset', - 'Kpt3dSviewRgbImgTopDownDataset', 'Kpt2dSviewRgbImgBottomUpDataset', - 'Kpt3dSviewKpt2dDataset', 'Kpt2dSviewRgbVidTopDownDataset' + 'Kpt3dMviewRgbImgDirectDataset', + 'Kpt2dSviewRgbImgTopDownDataset', + 'Kpt3dSviewRgbImgTopDownDataset', + 'Kpt2dSviewRgbImgBottomUpDataset', + 'Kpt3dSviewKpt2dDataset', + 'Kpt2dSviewRgbVidTopDownDataset', + 'Kpt3dSviewDepthImgTopDownDataset', ] diff --git a/mmpose/datasets/datasets/base/kpt_3d_sview_depth_img_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_3d_sview_depth_img_top_down_dataset.py new file mode 100644 index 0000000000..73a72e8c7e --- /dev/null +++ b/mmpose/datasets/datasets/base/kpt_3d_sview_depth_img_top_down_dataset.py @@ -0,0 +1,400 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from abc import ABCMeta, abstractmethod + +import json_tricks as json +import numpy as np +from torch.utils.data import Dataset +from xtcocotools.coco import COCO + +from mmpose.core.evaluation.top_down_eval import (keypoint_auc, keypoint_epe, + keypoint_pck_accuracy) +from mmpose.datasets import DatasetInfo +from mmpose.datasets.pipelines import Compose + + +class Kpt3dSviewDepthImgTopDownDataset(Dataset, metaclass=ABCMeta): + """Base class for keypoint 3D top-down pose estimation with single-view + depth image as the input. + + All depth-based datasets should subclass it. + All subclasses should overwrite: + Methods:`_get_db`, 'evaluate' + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + coco_style (bool): Whether the annotation json is coco-style. + Default: True + test_mode (bool): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + coco_style=True, + test_mode=False): + + self.image_info = {} + self.ann_info = {} + + self.ann_file = ann_file + self.img_prefix = img_prefix + self.pipeline = pipeline + self.test_mode = test_mode + + self.ann_info['image_size'] = np.array(data_cfg['image_size']) + self.ann_info['heatmap_size'] = np.array(data_cfg['heatmap_size']) + self.ann_info['num_joints'] = data_cfg['num_joints'] + + self.ann_info['inference_channel'] = data_cfg['inference_channel'] + self.ann_info['num_output_channels'] = data_cfg['num_output_channels'] + self.ann_info['dataset_channel'] = data_cfg['dataset_channel'] + + if dataset_info is None: + raise ValueError( + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.') + + dataset_info = DatasetInfo(dataset_info) + + assert self.ann_info['num_joints'] == dataset_info.keypoint_num + self.ann_info['flip_pairs'] = dataset_info.flip_pairs + self.ann_info['flip_index'] = dataset_info.flip_index + self.ann_info['upper_body_ids'] = dataset_info.upper_body_ids + self.ann_info['lower_body_ids'] = dataset_info.lower_body_ids + self.ann_info['joint_weights'] = dataset_info.joint_weights + self.ann_info['skeleton'] = dataset_info.skeleton + self.sigmas = dataset_info.sigmas + self.dataset_name = dataset_info.dataset_name + + if coco_style: + self.coco = COCO(ann_file) + if 'categories' in self.coco.dataset: + cats = [ + cat['name'] + for cat in self.coco.loadCats(self.coco.getCatIds()) + ] + self.classes = ['__background__'] + cats + self.num_classes = len(self.classes) + self._class_to_ind = dict( + zip(self.classes, range(self.num_classes))) + self._class_to_coco_ind = dict( + zip(cats, self.coco.getCatIds())) + self._coco_ind_to_class_ind = dict( + (self._class_to_coco_ind[cls], self._class_to_ind[cls]) + for cls in self.classes[1:]) + self.img_ids = self.coco.getImgIds() + self.num_images = len(self.img_ids) + self.id2name, self.name2id = self._get_mapping_id_name( + self.coco.imgs) + + self.db = [] + + self.pipeline = Compose(self.pipeline) + + @staticmethod + def _cam2pixel(cam_coord, f, c): + """Transform the joints from their camera coordinates to their pixel + coordinates. + + Note: + N: number of joints + + Args: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + img_coord (ndarray[N, 3]): the coordinates (x, y, 0) + in the image plane. + """ + x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0] + y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1] + z = np.zeros_like(x) + img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return img_coord + + @staticmethod + def _world2cam(world_coord, R, T): + """Transform the joints from their world coordinates to their camera + coordinates. + + Note: + N: number of joints + + Args: + world_coord (ndarray[3, N]): 3D joints coordinates + in the world coordinate system + R (ndarray[3, 3]): camera rotation matrix + T (ndarray[3, 1]): camera position (x, y, z) + + Returns: + cam_coord (ndarray[3, N]): 3D joints coordinates + in the camera coordinate system + """ + cam_coord = np.dot(R, world_coord - T) + return cam_coord + + @staticmethod + def _pixel2cam(pixel_coord, f, c): + """Transform the joints from their pixel coordinates to their camera + coordinates. 
+ + Note: + N: number of joints + + Args: + pixel_coord (ndarray[N, 3]): 3D joints coordinates + in the pixel coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + cam_coord (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + """ + x = (pixel_coord[:, 0] - c[0]) / f[0] * pixel_coord[:, 2] + y = (pixel_coord[:, 1] - c[1]) / f[1] * pixel_coord[:, 2] + z = pixel_coord[:, 2] + cam_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return cam_coord + + @staticmethod + def _xyz2uvd(xyz, f, c): + """Transform the joints from their 3d xyz camera coordinates to their + 2.5D uvd coordinates. + + Note: + N: number of joints + + Args: + xyz (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + uvd (ndarray[N, 3]): the 2.5D coordinates (u, v, d) in the spatial. + """ + u = xyz[:, 0] / (xyz[:, 2] + 1e-8) * f[0] + c[0] + v = xyz[:, 1] / (xyz[:, 2] + 1e-8) * f[1] + c[1] + d = xyz[:, 2] + uvd = np.concatenate((u[:, None], v[:, None], d[:, None]), 1) + return uvd + + @staticmethod + def _uvd2xyz(uvd, f, c): + """Transform the joints from their 2.5D uvd coordinates to their 3D xyz + camera coordinates. + + Note: + N: number of joints + + Args: + uvd (ndarray[N, 3]): 3D joints coordinates + in the pixel coordinate system + f (ndarray[2]): focal length of x and y axis + c (ndarray[2]): principal point of x and y axis + + Returns: + xyz (ndarray[N, 3]): 3D joints coordinates + in the camera coordinate system + """ + x = (uvd[:, 0] - c[0]) / f[0] * uvd[:, 2] + y = (uvd[:, 1] - c[1]) / f[1] * uvd[:, 2] + z = uvd[:, 2] + xyz = np.concatenate((x[:, None], y[:, None], z[:, None]), 1) + return xyz + + @staticmethod + def _center2bounds(center_uvd, cube_size, f): + """ + + Args: + center_uvd (ndarray[1, 3]): + cube_size (ndarray[3]): + f (ndarray[2]): focal length of x and y axis + + Returns: + bounds (ndarray[1, 6]): 2.5D bounds + """ + + ustart = center_uvd[:, + 0] - (cube_size[0] / 2.) / center_uvd[:, 2] * f[0] + vstart = center_uvd[:, + 1] - (cube_size[1] / 2.) / center_uvd[:, 2] * f[1] + uend = center_uvd[:, 0] + (cube_size[0] / 2.) / center_uvd[:, 2] * f[0] + vend = center_uvd[:, 1] + (cube_size[1] / 2.) / center_uvd[:, 2] * f[1] + dstart = center_uvd[:, 2] - cube_size[2] / 2. + dend = center_uvd[:, 2] + cube_size[2] / 2. + bounds = np.concatenate( + (ustart[:, None], uend[:, None], vstart[:, None], vend[:, None], + dstart[:, None], dend[:, None]), 1) + return bounds + + @staticmethod + def _get_mapping_id_name(imgs): + """ + Args: + imgs (dict): dict of image info. + + Returns: + tuple: Image name & id mapping dicts. + + - id2name (dict): Mapping image id to name. + - name2id (dict): Mapping image name to id. + """ + id2name = {} + name2id = {} + for image_id, image in imgs.items(): + file_name = image['file_name'] + id2name[image_id] = file_name + name2id[file_name] = image_id + + return id2name, name2id + + def _xywh2cs(self, x, y, w, h, padding=1.25): + """This encodes bbox(x,y,w,h) into (center, scale) + + Args: + x, y, w, h (float): left, top, width and height + padding (float): bounding box padding factor + + Returns: + center (np.ndarray[float32](2,)): center of the bbox (x, y). + scale (np.ndarray[float32](2,)): scale of the bbox w & h. 
+ """ + aspect_ratio = self.ann_info['image_size'][0] / self.ann_info[ + 'image_size'][1] + center = np.array([x + w * 0.5, y + h * 0.5], dtype=np.float32) + + if (not self.test_mode) and np.random.rand() < 0.3: + center += 0.4 * (np.random.rand(2) - 0.5) * [w, h] + + if w > aspect_ratio * h: + h = w * 1.0 / aspect_ratio + elif w < aspect_ratio * h: + w = h * aspect_ratio + + # pixel std is 200.0 + scale = np.array([w / 200.0, h / 200.0], dtype=np.float32) + # padding to include proper amount of context + scale = scale * padding + + return center, scale + + @abstractmethod + def _get_db(self): + """Load dataset.""" + raise NotImplementedError + + @abstractmethod + def evaluate(self, results, *args, **kwargs): + """Evaluate keypoint results.""" + + @staticmethod + def _write_keypoint_results(keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def __len__(self): + """Get the size of the dataset.""" + return len(self.db) + + def __getitem__(self, idx): + """Get the sample given index.""" + results = copy.deepcopy(self.db[idx]) + results['ann_info'] = self.ann_info + return self.pipeline(results) + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts + + def _report_metric(self, + res_file, + metrics, + pck_thr=0.2, + pckh_thr=0.7, + auc_nor=30): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE', 'NME'. + pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. 
+ """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + box_sizes = [] + threshold_bbox = [] + threshold_head_box = [] + + for pred, item in zip(preds, self.db): + + self.ann_info['image_size'] + + # pred_joint_xyz = self._uvd2xyz( + # np.array(pred['keypoints'], dtype=np.float32), item['focal'], + # item['princpt']) + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + box_sizes.append(item.get('box_size', 1)) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_head_box = np.array(threshold_head_box) + box_sizes = np.array(box_sizes).reshape([-1, 1]) + + if 'PCK' in metrics: + _, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + info_str.append(('PCK', pck)) + + if 'AUC' in metrics: + info_str.append(('AUC', keypoint_auc(outputs, gts, masks, + auc_nor))) + + if 'EPE' in metrics: + info_str.append(('EPE', keypoint_epe(outputs, gts, masks))) + + return info_str diff --git a/mmpose/datasets/datasets/hand/__init__.py b/mmpose/datasets/datasets/hand/__init__.py index 49159afa60..1fe5848032 100644 --- a/mmpose/datasets/datasets/hand/__init__.py +++ b/mmpose/datasets/datasets/hand/__init__.py @@ -3,6 +3,7 @@ from .hand_coco_wholebody_dataset import HandCocoWholeBodyDataset from .interhand2d_dataset import InterHand2DDataset from .interhand3d_dataset import InterHand3DDataset +from .nyuhand_dataset import NYUHandDataset from .onehand10k_dataset import OneHand10KDataset from .panoptic_hand2d_dataset import PanopticDataset from .rhd2d_dataset import Rhd2DDataset @@ -10,5 +11,5 @@ __all__ = [ 'FreiHandDataset', 'InterHand2DDataset', 'InterHand3DDataset', 'OneHand10KDataset', 'PanopticDataset', 'Rhd2DDataset', - 'HandCocoWholeBodyDataset' + 'HandCocoWholeBodyDataset', 'NYUHandDataset' ] diff --git a/mmpose/datasets/datasets/hand/nyuhand_dataset.py b/mmpose/datasets/datasets/hand/nyuhand_dataset.py new file mode 100644 index 0000000000..6c54557366 --- /dev/null +++ b/mmpose/datasets/datasets/hand/nyuhand_dataset.py @@ -0,0 +1,168 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings + +import json_tricks as json +import numpy as np +from mmcv import Config, deprecated_api_warning + +from mmpose.datasets.builder import DATASETS +from ..base import Kpt3dSviewDepthImgTopDownDataset + + +@DATASETS.register_module() +class NYUHandDataset(Kpt3dSviewDepthImgTopDownDataset): + """TODO, add more detail doc. + + Args: + ann_file (str): Path to the annotation file. + camera_file (str): Path to the camera file. + joint_file (str): Path to the joint file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + use_refined_center (bool): Using refined bbox center. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (str): Store True when building test or + validation dataset. Default: False. 
+ """ + + def __init__(self, + ann_file, + camera_file, + joint_file, + img_prefix, + data_cfg, + pipeline, + use_refined_center=False, + align_uvd_xyz_direction=True, + dataset_info=None, + test_mode=False): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/nyu.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.ann_info['cube_size'] = np.array(data_cfg['cube_size']) + self.ann_info['use_different_joint_weights'] = False + + self.camera_file = camera_file + self.joint_file = joint_file + self.align_uvd_xyz_direction = align_uvd_xyz_direction + self.use_refined_center = use_refined_center + if self.align_uvd_xyz_direction: + self.flip_y = -1 + else: + self.flip_y = 1 + self.meter2millimeter = 1 / 1000. + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + with open(self.camera_file, 'r') as f: + cameras = json.load(f) + with open(self.joint_file, 'r') as f: + joints = json.load(f) + + gt_db = [] + bbox_id = 0 + for img_id in self.img_ids: + num_joints = self.ann_info['num_joints'] + + ann_id = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + ann = self.coco.loadAnns(ann_id)[0] + img = self.coco.loadImgs(img_id)[0] + + frame_idx = str(img['frame_idx']) + image_file = osp.join(self.img_prefix, self.id2name[img_id]) + + focal = np.array([cameras['fx'], cameras['fy']], dtype=np.float32) + principal_pt = np.array([cameras['cx'], cameras['cy']], + dtype=np.float32) + + joint_uvd = np.array( + joints[frame_idx]['joint_uvd'], dtype=np.float32) + joint_xyz = np.array( + joints[frame_idx]['joint_xyz'], dtype=np.float32) + joint_xyz[:, 1] *= self.flip_y + + # calculate bbox online + # using center_xyz and cube_size, then project to 2D as bbox + if self.use_refined_center: + center_xyz = np.array( + ann['center_refined_xyz'], + dtype=np.float32).reshape(-1, 1) + else: + center_xyz = np.mean(joint_xyz, axis=0, keepdims=True) + center_depth = center_xyz[0, 2] + center_uvd = self._xyz2uvd(center_xyz, focal, principal_pt) + + if self.test_mode and img_id >= 2440: + cube_size = np.array( + self.ann_info['cube_size'], dtype=np.float32) * 5.0 / 6.0 + else: + cube_size = np.array( + self.ann_info['cube_size'], dtype=np.float32) + + bounds_uvd = self._center2bounds(center_uvd, cube_size, focal) + bbox = np.array([ + bounds_uvd[0, 0], bounds_uvd[0, 2], bounds_uvd[0, 1] - + bounds_uvd[0, 0], bounds_uvd[0, 3] - bounds_uvd[0, 2] + ], + dtype=np.float32) + + valid_joints_idx = self.ann_info['dataset_channel'] + joint_valid = np.zeros(joint_xyz.shape[0], dtype=np.float32) + joint_valid[valid_joints_idx] = 1.0 + + # joint_3d will be normalized in pre-processing pipeline + # uv are processed by TopDownAffine + # depth are processed by DepthToTensor + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d[:, :2] = joint_uvd[:, :2] + joints_3d[:, 2] = joint_uvd[:, 2] + + joints_3d_visible[...] 
= np.minimum(1, joint_valid.reshape(-1, 1)) + + gt_db.append({ + 'image_file': image_file, + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'joints_cam': joint_xyz, + 'joints_uvd': joint_uvd, + 'cube_size': cube_size, + 'center_depth': center_depth, + 'focal': focal, + 'princpt': principal_pt, + 'dataset': self.dataset_name, + 'bbox': bbox, + 'bbox_score': 1, + 'bbox_id': bbox_id + }) + bbox_id = bbox_id + 1 + gt_db = sorted(gt_db, key=lambda x: x['bbox_id']) + + return gt_db + + @deprecated_api_warning(name_dict=dict(outputs='results')) + def evaluate(self, results, res_folder=None, metric='EPE', **kwargs): + raise NotImplementedError diff --git a/mmpose/datasets/pipelines/hand_transform.py b/mmpose/datasets/pipelines/hand_transform.py index b83e399c4e..e84ec87e8a 100644 --- a/mmpose/datasets/pipelines/hand_transform.py +++ b/mmpose/datasets/pipelines/hand_transform.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmcv import numpy as np +from torchvision.transforms import functional as F from mmpose.datasets.builder import PIPELINES from .top_down_transform import TopDownRandomFlip @@ -61,3 +63,88 @@ def __call__(self, results): results['target'] = target * np.ones(1, dtype=np.float32) results['target_weight'] = target_weight * np.ones(1, dtype=np.float32) return results + + +@PIPELINES.register_module() +class DepthToTensor: + """Transform depth image to Tensor. + TODO: add reference from AWR github + + Required key: 'img', 'cube_size', 'center_depth'. + + Modifies key: 'img'. + """ + + def __init__(self): + pass + + def __call__(self, results): + if isinstance(results['img'], (list, tuple)): + results['img'] = [ + F.to_tensor(self._process_depth(img, results)) + for img in results['img'] + ] + else: + depth = self._process_depth(results['img'], results) + results['img'] = F.to_tensor(depth) + return results + + @staticmethod + def _process_depth(img, results): + depth = np.asarray(img[:, :, 0] + img[:, :, 1] * 256, dtype=np.float32) + img_max = np.max(depth) + depth_max = results['center_depth'] + (results['cube_size'][2] / 2.) + depth_min = results['center_depth'] - (results['cube_size'][2] / 2.) + depth[depth == img_max] = depth_max + depth[depth == 0] = depth_max + depth = np.clip(depth, depth_min, depth_max) + depth = (depth - results['center_depth']) / ( + results['cube_size'][2] / 2.) 
+ return depth + + +@PIPELINES.register_module() +class HandGenerateJointToOffset: + """""" + + def __init__(self, heatmap_kernel_size): + self.heatmap_kernel_size = heatmap_kernel_size + + def __call__(self, results): + cfg = results['ann_info'] + feature_size = cfg['heatmap_size'] + joint_uvd = results['target'] # UV -1,1 + num_joints = joint_uvd.shape[0] + + img = results['img'] + depth = img.numpy()[0] # it is a hack + + coord_x = (2.0 * (np.arange(feature_size[0]) + 0.5) / feature_size[0] - + 1.0).astype(np.float32) + coord_y = (2.0 * (np.arange(feature_size[1]) + 0.5) / feature_size[1] - + 1.0).astype(np.float32) + xv, yv = np.meshgrid(coord_x, coord_y) + coord = np.stack((xv, yv), 0) + depth_resize = mmcv.imresize( + depth, (feature_size[0], feature_size[1]), interpolation='nearest') + depth_resize = np.expand_dims(depth_resize, 0) + coord_with_depth = np.expand_dims( + np.concatenate((coord, depth_resize), 0), 0) + jt_ft = np.broadcast_to(joint_uvd[:, :, np.newaxis, np.newaxis], + (joint_uvd.shape[0], joint_uvd.shape[1], + feature_size[0], feature_size[1])) + offset = jt_ft - coord_with_depth # [jt_num, 3, F, F] + dis = np.linalg.norm(offset + 1e-8, axis=1) # [jt_num, F, F] + offset_norm = offset / dis[:, np.newaxis, ...] # value in [-1, 1] + heatmap = (self.heatmap_kernel_size - + dis) / self.heatmap_kernel_size # [jt_num, F, F] + mask = (heatmap > 0).astype(np.float32) * (depth_resize < 0.99).astype( + np.float32) # [jt_num, F, F] + offset_norm_mask = (offset_norm * mask[:, None, ...]).reshape( + -1, feature_size[0], feature_size[1]) + heatmap_mask = heatmap * mask + offset_field = np.concatenate((offset_norm_mask, heatmap_mask), + axis=0) # [jt_num*4, F, F] + results['target'] = offset_field + results['target_weight'] = np.ones(num_joints) + return results diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index c230870eaf..886d15e9ac 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -726,10 +726,23 @@ class TopDownGenerateTargetRegression: Required key: 'joints_3d', 'joints_3d_visible', 'ann_info'. Modified key: 'target', and 'target_weight'. + + Args: + use_zero_mean: (bool) If set to True, target normalize to [-1, 1], + otherwise [0,1] + joint_indices: (list): Indices of joints used for heatmap generation. + If None (default) is given, all joints will be used. """ - def __init__(self): - pass + def __init__(self, + use_zero_mean=False, + joint_indices=None, + is_3d=False, + normalize_depth=False): + self.use_zero_mean = use_zero_mean + self.joint_indices = joint_indices + self.is_3d = is_3d + self.normalize_depth = normalize_depth def _generate_target(self, cfg, joints_3d, joints_3d_visible): """Generate the target regression vector. 
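For reference, the offset-field target built by `HandGenerateJointToOffset` above stacks, for K joints on an F x F feature map, K*3 masked unit-offset channels followed by K masked closeness-heatmap channels into a (K*4, F, F) array; the same layout is consumed by `joint2offset` / `offset2joint_softmax` in `AdaptiveWeightingRegression3DHead`. A small self-contained numpy sketch of that layout (K, F and the depth input are assumed for illustration, not part of the patch, which uses the actual cropped depth image):

import numpy as np

K, F, kernel = 14, 64, 1.0                                         # joints, feature size, kernel size
joint_uvd = np.random.uniform(-1, 1, (K, 3)).astype(np.float32)    # normalized joint targets
depth = np.random.uniform(-1, 1, (1, F, F)).astype(np.float32)     # cropped depth, normalized to [-1, 1]

axis = 2.0 * (np.arange(F, dtype=np.float32) + 0.5) / F - 1.0
xv, yv = np.meshgrid(axis, axis)                                   # pixel coordinates in [-1, 1]
grid = np.concatenate((np.stack((xv, yv), 0), depth), 0)[None]     # (1, 3, F, F)

offset = joint_uvd[:, :, None, None] - grid                        # (K, 3, F, F): joint minus pixel
dis = np.linalg.norm(offset + 1e-8, axis=1)                        # (K, F, F): distance to each joint
heatmap = (kernel - dis) / kernel                                  # closeness, positive near the joint
mask = ((heatmap > 0) & (grid[0, 2] < 0.99)).astype(np.float32)    # near a joint and on foreground depth
offset_norm_mask = ((offset / dis[:, None]) * mask[:, None]).reshape(-1, F, F)   # (K*3, F, F)
offset_field = np.concatenate((offset_norm_mask, heatmap * mask), axis=0)        # (K*4, F, F)
assert offset_field.shape == (K * 4, F, F)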
@@ -746,20 +759,41 @@ def _generate_target(self, cfg, joints_3d, joints_3d_visible): joint_weights = cfg['joint_weights'] use_different_joint_weights = cfg['use_different_joint_weights'] + # only preserve used joint if joint_indices is given + if self.joint_indices is not None and len(self.joint_indices) > 0: + joint_weights = joint_weights[self.joint_indices] + joints_3d = joints_3d[self.joint_indices] + joints_3d_visible = joints_3d_visible[self.joint_indices] + mask = (joints_3d[:, 0] >= 0) * ( joints_3d[:, 0] <= image_size[0] - 1) * (joints_3d[:, 1] >= 0) * ( joints_3d[:, 1] <= image_size[1] - 1) - target = joints_3d[:, :2] / image_size + keypoints_dim = 3 if self.is_3d else 2 + + if self.use_zero_mean: + target = joints_3d[:, :keypoints_dim] + target_2d = target[:, :2] / image_size + target_2d = 2 * target_2d - 1 + target[:, :2] = target_2d + else: + target = joints_3d[:, :keypoints_dim] + target_2d = target[:, :2] / image_size + target[:, :2] = target_2d target = target.astype(np.float32) - target_weight = joints_3d_visible[:, :2] * mask[:, None] + target_weight = joints_3d_visible[:, :keypoints_dim] * mask[:, None] if use_different_joint_weights: target_weight = np.multiply(target_weight, joint_weights) return target, target_weight + def _normalize_target(self, joints_3d, center_depth, cube_size): + joints_3d[:, 2] = (joints_3d[:, 2] - center_depth) / ( + cube_size[2] / 2.0) + return joints_3d + def __call__(self, results): """Generate the target heatmap.""" joints_3d = results['joints_3d'] @@ -769,6 +803,9 @@ def __call__(self, results): joints_3d, joints_3d_visible) + if self.is_3d and self.normalize_depth: + target = self._normalize_target(target, results['center_depth'], + results['cube_size']) results['target'] = target results['target_weight'] = target_weight diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index 2fc64a8af3..5ea977f824 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .alexnet import AlexNet +from .awr_resnet import AWRResNet from .cpm import CPM from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet @@ -35,5 +36,6 @@ 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', 'LiteHRNet', 'V2VNet', 'HRFormer', 'PyramidVisionTransformer', - 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer' + 'PyramidVisionTransformerV2', 'SwinTransformer', 'I3D', 'TCFormer', + 'AWRResNet' ] diff --git a/mmpose/models/backbones/awr_resnet.py b/mmpose/models/backbones/awr_resnet.py new file mode 100644 index 0000000000..752725b532 --- /dev/null +++ b/mmpose/models/backbones/awr_resnet.py @@ -0,0 +1,33 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from .resnet import ResNet + + +@BACKBONES.register_module() +class AWRResNet(ResNet): + """AWR ResNet backbone. + + Using a specialized stem scheme. 
+ """ + + def __init__(self, **kwargs): + super().__init__(deep_stem=False, **kwargs) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer for depth.""" + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=5, + stride=1, + padding=2, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index d94d8b8aab..bf5d1f629a 100644 --- a/mmpose/models/detectors/__init__.py +++ b/mmpose/models/detectors/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from .associative_embedding import AssociativeEmbedding +from .depthhand_3d import Depthhand3D from .gesture_recognizer import GestureRecognizer from .interhand_3d import Interhand3D from .mesh import ParametricMesh @@ -13,5 +14,6 @@ __all__ = [ 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', 'PoseLifter', 'Interhand3D', 'PoseWarper', 'DetectAndRegress', - 'VoxelCenterDetector', 'VoxelSinglePose', 'GestureRecognizer' + 'VoxelCenterDetector', 'VoxelSinglePose', 'GestureRecognizer', + 'Depthhand3D' ] diff --git a/mmpose/models/detectors/depthhand_3d.py b/mmpose/models/detectors/depthhand_3d.py new file mode 100644 index 0000000000..ac33ab8749 --- /dev/null +++ b/mmpose/models/detectors/depthhand_3d.py @@ -0,0 +1,303 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmcv +import numpy as np +from mmcv.utils.misc import deprecated_api_warning + +from mmpose.core import imshow_keypoints, imshow_keypoints_3d +from ..builder import POSENETS +from .top_down import TopDown + + +@POSENETS.register_module() +class Depthhand3D(TopDown): + """Top-down depth-based 3d keypoints detector.""" + + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. list[Tensor], list[list[dict]]), with + the outer list indicating test time augmentations. + + Note: + - batch_size: N + - num_keypoints: K + - num_img_channel: C (Default: 3) + - img height: imgH + - img width: imgW + - heatmaps height: H + - heatmaps weight: W + + Args: + img (torch.Tensor[NxCximgHximgW]): Input images. + target (list[torch.Tensor]): Target heatmaps, relative hand + root depth and hand type. + target_weight (list[torch.Tensor]): Weights for target + heatmaps, relative hand root depth and hand type. + img_metas (list(dict)): Information about data augmentation + By default this includes: + + - "image_file: path to the image file + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + - "heatmap3d_depth_bound": depth bound of hand keypoint 3D + heatmap + - "root_depth_bound": depth bound of relative root depth 1D + heatmap + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + + Returns: + dict|tuple: if `return loss` is true, then return losses. 
\ + Otherwise, return predicted poses, boxes, image paths, \ + heatmaps, relative hand root depth and hand type. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test(img, img_metas, **kwargs) + + def forward_train(self, img, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + if self.train_cfg['use_img_for_head']: + output = self.keypoint_head((features, img)) + else: + output = self.keypoint_head(features) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + + # import ipdb + # ipdb.set_trace() + # + # from mmpose.core.post_processing import (affine_transform, + # get_affine_transform, + # get_warp_matrix, + # warp_affine_joints) + # + # used_index = img_metas[0]['dataset_channel'] + # jt_xyz_gt = img_metas[0]['joints_cam'][used_index] + # center = img_metas[0]['center'] + # scale = img_metas[0]['scale'] + # rotation = img_metas[0]['rotation'] + # image_size = img_metas[0]['image_size'] + # trans = get_affine_transform(center, scale, rotation, image_size) + # inv_trans = get_affine_transform( + # center, scale, rotation, image_size, inv=True) + # + # jt_uvd_pred = output[1][0].detach().cpu().numpy() + # jt_uvd_pred[:, :2] = (jt_uvd_pred[:, :2] + + # 1) * img_metas[0]['image_size'] / 2. + # jt_uvd_pred[:, 2] = jt_uvd_pred[:, 2] * img_metas[0]['cube_size'][ + # 2] / 2 + img_metas[0]['center_depth'] + # + # jt_uvd_gt = target[1][0].detach().cpu().numpy() + # jt_uvd_gt[:, :2] = (jt_uvd_gt[:, :2] + + # 1) * img_metas[0]['image_size'] / 2. + # jt_uvd_gt[:, 2] = jt_uvd_gt[:, 2] *\ + # img_metas[0]['cube_size'][2] / 2\ + # + img_metas[0]['center_depth'] + # + # for i in range(len(img_metas[0]['dataset_channel'])): + # jt_uvd_gt[i, 0:2] = affine_transform(jt_uvd_gt[i, 0:2].copy(), + # inv_trans) + # + # import ipdb + # ipdb.set_trace() + # from mmpose.datasets.datasets.base import \ + # Kpt3dSviewDepthImgTopDownDataset + # jt_xyz_gt_from_uvd = Kpt3dSviewDepthImgTopDownDataset._uvd2xyz( + # jt_uvd_gt, f=img_metas[0]['focal'], c=img_metas[0]['princpt']) + + return losses + + def forward_test(self, img, img_metas, **kwargs): + """Defines the computation performed at every call when testing.""" + assert img.size(0) == len(img_metas) + batch_size, _, img_height, img_width = img.shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + features = self.backbone(img) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + if self.train_cfg['use_img_for_head']: + output = self.keypoint_head.inference_model((features, img), + flip_pairs=None) + else: + output = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + raise NotImplementedError + + if self.with_keypoint: + result = self.keypoint_head.decode( + img_metas, output, img_size=[img_width, img_height]) + else: + result = {} + return result + + def forward_dummy(self, img): + """Used for computing network FLOPs. + + See ``tools/get_flops.py``. + + Args: + img (torch.Tensor): Input image. + + Returns: + Tensor: Output heatmaps. 
+ """ + output = self.backbone(img) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + if self.train_cfg['use_img_for_head']: + output = self.keypoint_head((output, img)) + else: + output = self.keypoint_head(output) + return output + + @deprecated_api_warning({'pose_limb_color': 'pose_link_color'}, + cls_name='Depthhand3D') + def show_result( + self, # TODO: NotImplement + result, + img=None, + skeleton=None, + kpt_score_thr=0.3, + radius=8, + bbox_color='green', + thickness=2, + pose_kpt_color=None, + pose_link_color=None, + vis_height=400, + num_instances=-1, + win_name='', + show=False, + wait_time=0, + out_file=None): + """Visualize 3D pose estimation results. + + Args: + result (list[dict]): The pose estimation results containing: + + - "keypoints_3d" ([K,4]): 3D keypoints + - "keypoints" ([K,3] or [T,K,3]): Optional for visualizing + 2D inputs. If a sequence is given, only the last frame + will be used for visualization + - "bbox" ([4,] or [T,4]): Optional for visualizing 2D inputs + - "title" (str): title for the subplot + img (str or Tensor): Optional. The image to visualize 2D inputs on. + skeleton (list of [idx_i,idx_j]): Skeleton described by a list of + links, each is a pair of joint indices. + kpt_score_thr (float, optional): Minimum score of keypoints + to be shown. Default: 0.3. + radius (int): Radius of circles. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + thickness (int): Thickness of lines. + pose_kpt_color (np.array[Nx3]`): Color of N keypoints. + If None, do not draw keypoints. + pose_link_color (np.array[Mx3]): Color of M limbs. + If None, do not draw limbs. + vis_height (int): The image height of the visualization. The width + will be N*vis_height depending on the number of visualized + items. + num_instances (int): Number of instances to be shown in 3D. If + smaller than 0, all the instances in the pose_result will be + shown. Otherwise, pad or truncate the pose_result to a length + of num_instances. + win_name (str): The window name. + show (bool): Whether to show the image. Default: False. + wait_time (int): Value of waitKey param. + Default: 0. + out_file (str or None): The filename to write the image. + Default: None. + + Returns: + Tensor: Visualized img, only if not `show` or `out_file`. 
+ """ + if num_instances < 0: + assert len(result) > 0 + result = sorted(result, key=lambda x: x.get('track_id', 0)) + + # draw image and 2d poses + if img is not None: + img = mmcv.imread(img) + + bbox_result = [] + pose_2d = [] + for res in result: + if 'bbox' in res: + bbox = np.array(res['bbox']) + if bbox.ndim != 1: + assert bbox.ndim == 2 + bbox = bbox[-1] # Get bbox from the last frame + bbox_result.append(bbox) + if 'keypoints' in res: + kpts = np.array(res['keypoints']) + if kpts.ndim != 2: + assert kpts.ndim == 3 + kpts = kpts[-1] # Get 2D keypoints from the last frame + pose_2d.append(kpts) + + if len(bbox_result) > 0: + bboxes = np.vstack(bbox_result) + mmcv.imshow_bboxes( + img, + bboxes, + colors=bbox_color, + top_k=-1, + thickness=2, + show=False) + if len(pose_2d) > 0: + imshow_keypoints( + img, + pose_2d, + skeleton, + kpt_score_thr=kpt_score_thr, + pose_kpt_color=pose_kpt_color, + pose_link_color=pose_link_color, + radius=radius, + thickness=thickness) + img = mmcv.imrescale(img, scale=vis_height / img.shape[0]) + + img_vis = imshow_keypoints_3d( + result, + img, + skeleton, + pose_kpt_color, + pose_link_color, + vis_height, + axis_limit=300, + axis_azimuth=-115, + axis_elev=15, + kpt_score_thr=kpt_score_thr, + num_instances=num_instances) + + if show: + mmcv.visualization.imshow(img_vis, win_name, wait_time) + + if out_file is not None: + mmcv.imwrite(img_vis, out_file) + + return img_vis diff --git a/mmpose/models/heads/__init__.py b/mmpose/models/heads/__init__.py index 459c20b8bd..e72763ab9a 100644 --- a/mmpose/models/heads/__init__.py +++ b/mmpose/models/heads/__init__.py @@ -2,6 +2,7 @@ from .ae_higher_resolution_head import AEHigherResolutionHead from .ae_multi_stage_head import AEMultiStageHead from .ae_simple_head import AESimpleHead +from .awr_head import AdaptiveWeightingRegression3DHead from .deconv_head import DeconvHead from .deeppose_regression_head import DeepposeRegressionHead from .hmr_head import HMRMeshHead @@ -21,5 +22,5 @@ 'AEHigherResolutionHead', 'AESimpleHead', 'AEMultiStageHead', 'DeepposeRegressionHead', 'TemporalRegressionHead', 'Interhand3DHead', 'HMRMeshHead', 'DeconvHead', 'ViPNASHeatmapSimpleHead', 'CuboidCenterHead', - 'CuboidPoseHead', 'MultiModalSSAHead' + 'CuboidPoseHead', 'MultiModalSSAHead', 'AdaptiveWeightingRegression3DHead' ] diff --git a/mmpose/models/heads/awr_head.py b/mmpose/models/heads/awr_head.py new file mode 100644 index 0000000000..e29664c89d --- /dev/null +++ b/mmpose/models/heads/awr_head.py @@ -0,0 +1,458 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_upsample_layer, constant_init, normal_init + +from mmpose.core.camera import SimpleCamera +from mmpose.core.evaluation.top_down_eval import keypoints_from_joint_uvd +from mmpose.models.builder import build_loss +from ..builder import HEADS + + +class OffsetHead(nn.Module): + + def __init__(self, + in_channels, + out_channels_vector, + out_channels_scalar, + heatmap_kernel_size, + dummy_args=None): + + super().__init__() + + self.heatmap_kernel_size = heatmap_kernel_size + assert out_channels_vector == out_channels_scalar * 3 + self.vector_offset = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels_vector, + kernel_size=1, + stride=1, + padding=0) + self.scalar_offset = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels_scalar, + kernel_size=1, + stride=1, + padding=0) + + def forward(self, x): + """Forward function.""" + vec = self.vector_offset(x) + ht = self.scalar_offset(x) + # N, C, H, W = x.shape + offset_field = torch.cat((vec, ht), dim=1) + return offset_field + + def init_weights(self): + """Initialize model weights.""" + for m in self.vector_offset.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + for m in self.scalar_offset.modules(): + if isinstance(m, nn.Conv2d): + nn.init.normal_(m.weight, std=0.001) + nn.init.constant_(m.bias, 0) + + +class UpsampleHead(nn.Module): + """UpsampleHead is a sub-module of AWR Head, and outputs 3D heatmaps. + UpsampleHead is composed of (>=0) number of deconv layers. + + Args: + in_channels (int): Number of input channels + out_channels (int): Number of output channels + depth_size (int): Number of depth discretization size + num_deconv_layers (int): Number of deconv layers. + num_deconv_layers should >= 0. Note that 0 means no deconv layers. + num_deconv_filters (list|tuple): Number of filters. + num_deconv_kernels (list|tuple): Kernel sizes. + extra (dict): Configs for extra conv layers. 
Default: None + """ + + def __init__(self, + in_channels, + out_channels, + depth_size=64, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=None): + + super().__init__() + + assert out_channels % depth_size == 0 + self.depth_size = depth_size + self.in_channels = in_channels + + if extra is not None and not isinstance(extra, dict): + raise TypeError('extra should be dict or None.') + + if num_deconv_layers > 0: + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + num_deconv_kernels, + ) + elif num_deconv_layers == 0: + self.deconv_layers = nn.Identity() + else: + raise ValueError( + f'num_deconv_layers ({num_deconv_layers}) should >= 0.') + + identity_final_layer = False + if extra is not None and 'final_conv_kernel' in extra: + assert extra['final_conv_kernel'] in [0] + identity_final_layer = True + + if identity_final_layer: + self.final_layer = nn.Identity() + else: + # TODO: do not support this type of layer configuration + raise NotImplementedError + + def _make_deconv_layer(self, num_layers, num_filters, num_kernels): + """Make deconv layers.""" + if num_layers != len(num_filters): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_filters({len(num_filters)})' + raise ValueError(error_msg) + if num_layers != len(num_kernels): + error_msg = f'num_layers({num_layers}) ' \ + f'!= length of num_kernels({len(num_kernels)})' + raise ValueError(error_msg) + + layers = [] + for i in range(num_layers): + kernel, padding, output_padding = \ + self._get_deconv_cfg(num_kernels[i]) + + planes = num_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=self.in_channels, + out_channels=planes, + kernel_size=kernel, + stride=2, + padding=padding, + output_padding=output_padding, + bias=False)) + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + self.in_channels = planes + + return nn.Sequential(*layers) + + @staticmethod + def _get_deconv_cfg(deconv_kernel): + """Get configurations for deconv layers.""" + if deconv_kernel == 4: + padding = 1 + output_padding = 0 + elif deconv_kernel == 3: + padding = 1 + output_padding = 1 + elif deconv_kernel == 2: + padding = 0 + output_padding = 0 + else: + raise ValueError(f'Not supported num_kernels ({deconv_kernel}).') + + return deconv_kernel, padding, output_padding + + def forward(self, x): + """Forward function.""" + x = self.deconv_layers(x) + x = self.final_layer(x) + # N, C, H, W = x.shape + # # reshape the 2D heatmap to 3D heatmap + # x = x.reshape(N, C // self.depth_size, self.depth_size, H, W) + return x + + def init_weights(self): + """Initialize model weights.""" + for _, m in self.deconv_layers.named_modules(): + if isinstance(m, nn.ConvTranspose2d): + normal_init(m, std=0.001) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + for m in self.final_layer.modules(): + if isinstance(m, nn.Conv2d): + normal_init(m, std=0.001, bias=0) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + + +@HEADS.register_module() +class AdaptiveWeightingRegression3DHead(nn.Module): + """ + + Args: + deconv_head_cfg (dict): Configs of UpsampleHead for hand + keypoint estimation. + offset_head_cfg (dict): Configs of OffsetHead for hand + keypoint offset field estimation. + loss_keypoint (dict): Config for keypoint loss. Default: None. + loss_offset (dict): Config for offset field loss. Default: None. 
+ """ + + def __init__(self, + deconv_head_cfg, + offset_head_cfg, + loss_keypoint=None, + loss_offset=None, + train_cfg=None, + test_cfg=None): + super().__init__() + + self.deconv_head_cfg = deconv_head_cfg + self.offset_head_cfg = offset_head_cfg + + # build sub-module heads + # dense head + self.offset_head = OffsetHead(**offset_head_cfg) + # regression head + self.upsample_feature_head = UpsampleHead(**deconv_head_cfg) + + # build losses + self.keypoint_loss = build_loss(loss_keypoint) + self.offset_loss = build_loss(loss_offset) + self.train_cfg = {} if train_cfg is None else train_cfg + self.test_cfg = {} if test_cfg is None else test_cfg + + def init_weights(self): + self.upsample_feature_head.init_weights() + self.offset_head.init_weights() + + @staticmethod + def offset2joint_softmax(offset, img, kernel_size): + batch_size, feature_num, feature_size, _ = offset.size() + jt_num = int(feature_num / 4) + img = F.interpolate( + img, size=[feature_size, feature_size]) # (B, 1, F, F) + # unit directional vector + offset_vec = offset[:, :jt_num * 3].contiguous() # (B, jt_num*3, F, F) + # closeness heatmap + offset_ht = offset[:, jt_num * 3:].contiguous() # (B, jt_num, F, F) + + mesh_x = 2.0 * (torch.arange(feature_size).unsqueeze(0).expand( + feature_size, feature_size).float() + 0.5) / feature_size - 1.0 + mesh_y = 2.0 * (torch.arange(feature_size).unsqueeze(1).expand( + feature_size, feature_size).float() + 0.5) / feature_size - 1.0 + coords = torch.stack((mesh_x, mesh_y), dim=0) + coords = coords.unsqueeze(0).repeat(batch_size, 1, 1, + 1).to(offset.device) + coords = torch.cat((coords, img), + dim=1).repeat(1, jt_num, 1, + 1) # (B, jt_num*3, F, F) + coords = coords.view(batch_size, jt_num, 3, -1) # (B, jt_num, 3, F*F) + + mask = img.lt(0.99).float() # (B, 1, F, F) + offset_vec_mask = (offset_vec * mask).view(batch_size, jt_num, 3, + -1) # (B, jt_num, 3, F*F) + offset_ht_mask = (offset_ht * mask).view(batch_size, jt_num, + -1) # (B, jt_num, F*F) + offset_ht_norm = F.softmax( + offset_ht_mask * 30, dim=-1) # (B, jt_num, F*F) + dis = kernel_size - offset_ht_mask * kernel_size # (B, jt_num, F*F) + + jt_uvd = torch.sum( + (offset_vec_mask * dis.unsqueeze(2) + coords) * + offset_ht_norm.unsqueeze(2), + dim=-1) + + return jt_uvd.float() + + @staticmethod + def joint2offset(jt_uvd, img, kernel_size, feature_size): + """ + :params joint: hand joint coordinates, shape (B, joint_num, 3) + :params img: depth image, shape (B, C, H, W) + :params kernel_size + :params feature_size: size of generated offsets feature + """ + batch_size, jt_num, _ = jt_uvd.size() + img = F.interpolate(img, size=[feature_size, feature_size]) + jt_ft = jt_uvd.view(batch_size, -1, 1, + 1).repeat(1, 1, feature_size, + feature_size) # (B, joint_num*3, F, F) + + mesh_x = 2.0 * (torch.arange(feature_size).unsqueeze(0).expand( + feature_size, feature_size).float() + 0.5) / feature_size - 1.0 + mesh_y = 2.0 * (torch.arange(feature_size).unsqueeze(1).expand( + feature_size, feature_size).float() + 0.5) / feature_size - 1.0 + coords = torch.stack((mesh_x, mesh_y), dim=0) + coords = coords.unsqueeze(0).repeat(batch_size, 1, 1, 1).to( + jt_uvd.device) # (B, 2, F, F) + coords = torch.cat((coords, img), + dim=1).repeat(1, jt_num, 1, + 1) # (B, jt_num*3, F, F) + + offset = jt_ft - coords # (B, jt_num*3, F, F) + offset = offset.view(batch_size, jt_num, 3, feature_size, + feature_size) # (B, jt_num, 3, F, F) + dis = torch.sqrt(torch.sum(torch.pow(offset, 2), dim=2) + + 1e-8) # (B, jt_num, F, F) + + offset_norm = offset / 
dis.unsqueeze(2)  # (B, jt_num, 3, F, F)
+        heatmap = (kernel_size - dis) / kernel_size  # (B, jt_num, F, F)
+        mask = heatmap.ge(0).float() * img.lt(
+            0.99).float()  # (B, jt_num, F, F)
+
+        offset_norm_mask = (offset_norm *
+                            mask.unsqueeze(2)).view(batch_size, -1,
+                                                    feature_size, feature_size)
+        heatmap_mask = heatmap * mask.float()
+        return torch.cat((offset_norm_mask, heatmap_mask), dim=1).float()
+
+    def get_loss(self, output, target, target_weight):
+        """Calculate loss for the dense offset field and the regressed hand
+        keypoint coordinates.
+
+        Args:
+            output (list[Tensor]): a list of outputs from multiple heads.
+            target (list[Tensor]): a list of targets for multiple heads.
+            target_weight (list[Tensor]): a list of targets weight for
+                multiple heads.
+        """
+        losses = dict()
+
+        # hand keypoint offset field loss, dense loss
+        assert not isinstance(self.offset_loss, nn.Sequential)
+        out, tar, tar_weight = output[0], target[0], target_weight[0]
+        assert tar.dim() == 4 and tar_weight.dim() in [1, 2]
+        losses['offset_loss'] = self.offset_loss(out, tar)
+        # hand keypoint joint loss, regression loss
+        assert not isinstance(self.keypoint_loss, nn.Sequential)
+        out, tar, tar_weight = output[1], target[1], target_weight[1]
+        assert tar.dim() == 3 and tar_weight.dim() == 3
+        losses['joint_loss'] = self.keypoint_loss(out, tar, tar_weight)
+
+        return losses
+
+    def forward(self, x):
+        """Forward function."""
+        backbone_feature, img = x
+        feature = self.upsample_feature_head(backbone_feature)
+        offset_field = self.offset_head(feature)
+        jt_uvd = self.offset2joint_softmax(
+            offset_field, img, self.offset_head_cfg['heatmap_kernel_size'])
+        outputs = [offset_field, jt_uvd]
+        return outputs
+
+    def inference_model(self, x, flip_pairs=None):
+        """Inference function.
+
+        Returns:
+            output (list[np.ndarray]): list of the predicted offset field and
+                hand keypoint coordinates in normalized uvd space.
+
+        Args:
+            x (tuple[Tensor]): Backbone features and the cropped depth image.
+            flip_pairs (None | list[tuple()]):
+                Pairs of keypoints which are mirrored.
+        """
+
+        output = self.forward(x)
+
+        if flip_pairs is not None:
+            raise NotImplementedError
+        else:
+            output = [out.detach().cpu().numpy() for out in output]
+
+        return output
+
+    def decode(self, img_metas, output, **kwargs):
+        """Decode hand keypoints and offset fields.
+
+        Args:
+            img_metas (list(dict)): Information about data augmentation.
+                By default this includes:
+
+                - "image_file": path to the image file
+                - "center": center of the bbox
+                - "scale": scale of the bbox
+                - "rotation": rotation of the bbox
+                - "bbox_score": score of bbox
+                - "center_depth": depth of the crop cube center
+                - "cube_size": size of the crop cube around the hand
+                - "focal": camera focal length
+                - "princpt": camera principal point
+            output (list[np.ndarray]): model predicted offset field and
+                hand keypoint coordinates in normalized uvd space.
+ """ + + batch_size = len(img_metas) + result = {} + + center = np.zeros((batch_size, 2), dtype=np.float32) + scale = np.zeros((batch_size, 2), dtype=np.float32) + image_size = np.zeros((batch_size, 2), dtype=np.float32) + image_paths = [] + score = np.ones(batch_size, dtype=np.float32) + if 'bbox_id' in img_metas[0]: + bbox_ids = [] + else: + bbox_ids = None + + for i in range(batch_size): + center[i, :] = img_metas[i]['center'] + scale[i, :] = img_metas[i]['scale'] + image_size[i, :] = img_metas[i]['image_size'] + image_paths.append(img_metas[i]['image_file']) + + if 'bbox_score' in img_metas[i]: + score[i] = np.array(img_metas[i]['bbox_score']).reshape(-1) + if bbox_ids is not None: + bbox_ids.append(img_metas[i]['bbox_id']) + + all_boxes = np.zeros((batch_size, 6), dtype=np.float32) + all_boxes[:, 0:2] = center[:, 0:2] + all_boxes[:, 2:4] = scale[:, 0:2] + # scale is defined as: bbox_size / 200.0, so we + # need multiply 200.0 to get bbox size + all_boxes[:, 4] = np.prod(scale * 200.0, axis=1) + all_boxes[:, 5] = score + result['boxes'] = all_boxes + result['image_paths'] = image_paths + result['bbox_ids'] = bbox_ids + + # transform keypoint depth to camera space + joint_uvd = output[1] + preds, maxvals = keypoints_from_joint_uvd(joint_uvd, center, scale, + image_size) + keypoints_3d = np.zeros((batch_size, joint_uvd.shape[1], 4), + dtype=np.float32) + keypoints_3d[:, :, 0:3] = preds[:, :, 0:3] + keypoints_3d[:, :, 3:4] = maxvals + + center_depth = np.array( + [img_metas[i]['center_depth'] for i in range(len(img_metas))], + dtype=np.float32) + cube_size = np.array( + [img_metas[i]['cube_size'] for i in range(len(img_metas))], + dtype=np.float32) + keypoints_3d[:, :, 2] = \ + keypoints_3d[:, :, 2] * cube_size[:, 2:] / 2 \ + + center_depth[:, np.newaxis] + + result['preds'] = keypoints_3d + # joint uvd to joint xyz + cam_param = { + 'R': np.eye(3, dtype=np.float32), + 'T': np.zeros((3, 1), dtype=np.float32), + 'f': img_metas[0]['focal'].reshape(2, 1), + 'c': img_metas[0]['princpt'].reshape(2, 1), + } + single_view_camera = SimpleCamera(param=cam_param) + keypoints_xyz_list = [] + for batch_idx in range(batch_size): + keypoints_xyz_list.append( + single_view_camera.pixel_to_camera( + keypoints_3d[batch_idx, :, :3])) + result['preds_xyz'] = np.stack(keypoints_xyz_list, 0) + + return result diff --git a/mmpose/models/losses/__init__.py b/mmpose/models/losses/__init__.py index 9a491fbf76..34c2eee3e5 100644 --- a/mmpose/models/losses/__init__.py +++ b/mmpose/models/losses/__init__.py @@ -4,13 +4,14 @@ from .mesh_loss import GANLoss, MeshLoss from .mse_loss import JointsMSELoss, JointsOHKMMSELoss from .multi_loss_factory import AELoss, HeatmapLoss, MultiLossFactory -from .regression_loss import (BoneLoss, L1Loss, MPJPELoss, MSELoss, RLELoss, - SemiSupervisionLoss, SmoothL1Loss, SoftWingLoss, - WingLoss) +from .regression_loss import (AWRSmoothL1Loss, BoneLoss, L1Loss, MPJPELoss, + MSELoss, RLELoss, SemiSupervisionLoss, + SmoothL1Loss, SoftWingLoss, WingLoss) __all__ = [ 'JointsMSELoss', 'JointsOHKMMSELoss', 'HeatmapLoss', 'AELoss', 'MultiLossFactory', 'MeshLoss', 'GANLoss', 'SmoothL1Loss', 'WingLoss', 'MPJPELoss', 'MSELoss', 'L1Loss', 'BCELoss', 'BoneLoss', - 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss' + 'SemiSupervisionLoss', 'SoftWingLoss', 'AdaptiveWingLoss', 'RLELoss', + 'AWRSmoothL1Loss' ] diff --git a/mmpose/models/losses/regression_loss.py b/mmpose/models/losses/regression_loss.py index fc7aa33847..25ed066198 100644 --- a/mmpose/models/losses/regression_loss.py 
+++ b/mmpose/models/losses/regression_loss.py @@ -528,3 +528,46 @@ def forward(self, output, target): losses['bone_loss'] = loss_bone return losses + + +@LOSSES.register_module() +class AWRSmoothL1Loss(nn.Module): + """L1Loss loss .""" + + def __init__(self, use_target_weight=False, loss_weight=1.): + super().__init__() + self.use_target_weight = use_target_weight + self.loss_weight = loss_weight + + def forward(self, output, target, target_weight=None): + """Forward function. + + Note: + - batch_size: N + - num_keypoints: K + + Args: + output (torch.Tensor[N, K, 3]): Output regression. + target (torch.Tensor[N, K, 3]): Target regression. + target_weight (torch.Tensor[N, K, 3]): + Weights across different joint types. + """ + assert (output.shape == target.shape) + if self.use_target_weight: + assert target_weight is not None + z = (output * target_weight - target * target_weight) + else: + z = (output - target) + mse_mask = (torch.abs(z) < 0.01).to(dtype=z.dtype, device=z.device) + l1_mask = (torch.abs(z) >= 0.01).to(dtype=z.dtype, device=z.device) + mse = mse_mask * z + l1 = l1_mask * z + loss = torch.mean(self._calculate_MSE(mse) * mse_mask) + torch.mean( + self._calculate_L1(l1) * l1_mask) + return loss + + def _calculate_MSE(self, z): + return 0.5 * (torch.pow(z, 2)) + + def _calculate_L1(self, z): + return 0.01 * (torch.abs(z) - 0.005) diff --git a/tests/test_models/test_awr_3d_head.py b/tests/test_models/test_awr_3d_head.py new file mode 100644 index 0000000000..e5325ee7eb --- /dev/null +++ b/tests/test_models/test_awr_3d_head.py @@ -0,0 +1,91 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from mmpose.models import AdaptiveWeightingRegression3DHead + + +def test_awr_3d_head(): + N = 4 + input_shape = (N, 2048, 8, 8) + inputs = torch.rand(input_shape, dtype=torch.float32) + + img_input_shape = (N, 1, 128, 128) + img_inputs = torch.rand(img_input_shape, dtype=torch.float32) + + target = [ + inputs.new_ones(N, 14 * 4, 64, 64), + inputs.new_ones(N, 14, 3), + ] + target_weight = [ + inputs.new_ones(N, 14), + inputs.new_ones(N, 14, 3), + ] + + cameras = {'fx': 588.03, 'fy': 587.07, 'cx': 320.0, 'cy': 240.0} + + img_metas = [{ + 'img_shape': (128, 128, 3), + 'center': np.array([112, 112]), + 'scale': np.array([0.5, 0.5]), + 'bbox_score': 1.0, + 'bbox_id': 0, + 'flip_pairs': [], + 'inference_channel': np.arange(14), + 'cube_size': np.array([300, 300, 300]), + 'center_depth': 1.0, + 'focal': np.array([cameras['fx'], cameras['fy']]), + 'princpt': np.array([cameras['cx'], cameras['cy']]), + 'image_file': '.png', + } for _ in range(N)] + + print('fake input OK') + + head = AdaptiveWeightingRegression3DHead( + deconv_head_cfg=dict( + in_channels=2048, + out_channels=256, + depth_size=64, + num_deconv_layers=3, + num_deconv_filters=(256, 256, 256), + num_deconv_kernels=(4, 4, 4), + extra=dict(final_conv_kernel=0, )), + offset_head_cfg=dict( + in_channels=256, + out_channels_vector=42, + out_channels_scalar=14, + heatmap_kernel_size=0.4, + ), + loss_keypoint=dict(type='AWRSmoothL1Loss', use_target_weight=True), + loss_offset=dict(type='AWRSmoothL1Loss', use_target_weight=False), + train_cfg=dict(use_img_for_head=True), + test_cfg=dict(use_img_for_head=True, flip_test=False)) + + print('init OK') + + head.init_weights() + + # test forward + inputs_with_img = (inputs, img_inputs) + output = head(inputs_with_img) + assert isinstance(output, list) + assert len(output) == 2 + assert output[0].shape == (N, 14 * 4, 64, 64) + assert output[1].shape == (N, 14, 3) + 
+    # test loss computation
+    losses = head.get_loss(output, target, target_weight)
+    assert 'joint_loss' in losses
+    assert 'offset_loss' in losses
+
+    # test inference model
+    output = head.inference_model(inputs_with_img, flip_pairs=None)
+    assert isinstance(output, list)
+    assert len(output) == 2
+    assert output[0].shape == (N, 14 * 4, 64, 64)
+    assert output[1].shape == (N, 14, 3)
+
+    # test decode
+    result = head.decode(img_metas, output)
+    assert 'preds' in result
+    assert 'preds_xyz' in result
diff --git a/tests/test_models/test_depthhand_3d_forward.py b/tests/test_models/test_depthhand_3d_forward.py
new file mode 100644
index 0000000000..241d4ef1f4
--- /dev/null
+++ b/tests/test_models/test_depthhand_3d_forward.py
@@ -0,0 +1,108 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from mmpose.models import build_posenet
+
+
+def test_depthhand_3d_forward():
+    # model settings
+    model_cfg = dict(
+        type='Depthhand3D',  # pretrained=None
+        backbone=dict(
+            type='AWRResNet',
+            depth=50,
+            frozen_stages=-1,
+            zero_init_residual=False,
+            in_channels=1),
+        keypoint_head=dict(
+            type='AdaptiveWeightingRegression3DHead',
+            offset_head_cfg=dict(
+                in_channels=256,
+                out_channels_vector=42,
+                out_channels_scalar=14,
+                heatmap_kernel_size=0.4,
+            ),
+            deconv_head_cfg=dict(
+                in_channels=2048,
+                out_channels=256,
+                depth_size=64,
+                num_deconv_layers=3,
+                num_deconv_filters=(256, 256, 256),
+                num_deconv_kernels=(4, 4, 4),
+                extra=dict(final_conv_kernel=0, )),
+            loss_offset=dict(type='AWRSmoothL1Loss', use_target_weight=False),
+            loss_keypoint=dict(type='AWRSmoothL1Loss', use_target_weight=True),
+        ),
+        train_cfg=dict(use_img_for_head=True),
+        test_cfg=dict(use_img_for_head=True, flip_test=False))
+
+    detector = build_posenet(model_cfg)
+    detector.init_weights()
+
+    input_shape = (2, 1, 128, 128)
+    mm_inputs = _demo_mm_inputs(input_shape)
+
+    imgs = mm_inputs.pop('imgs')
+    target = mm_inputs.pop('target')
+    target_weight = mm_inputs.pop('target_weight')
+    img_metas = mm_inputs.pop('img_metas')
+
+    # Test forward train
+    losses = detector.forward(
+        imgs, target, target_weight, img_metas, return_loss=True)
+    assert isinstance(losses, dict)
+
+    # Test forward test
+    with torch.no_grad():
+        _ = detector.forward(imgs, img_metas=img_metas, return_loss=False)
+        _ = detector.forward_dummy(imgs)
+
+
+def _demo_mm_inputs(input_shape=(1, 1, 128, 128), num_outputs=None):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions
+    """
+    (N, C, H, W) = input_shape
+
+    rng = np.random.RandomState(0)
+
+    imgs = rng.rand(*input_shape)
+    imgs = torch.FloatTensor(imgs)
+
+    target = [
+        imgs.new_ones(N, 14 * 4, 64, 64),
+        imgs.new_ones(N, 14, 3),
+    ]
+    target_weight = [
+        imgs.new_ones(N, 14),
+        imgs.new_ones(N, 14, 3),
+    ]
+
+    cameras = {'fx': 588.03, 'fy': 587.07, 'cx': 320.0, 'cy': 240.0}
+
+    img_metas = [{
+        'img_shape': (128, 128, 3),
+        'center': np.array([112, 112]),
+        'scale': np.array([0.5, 0.5]),
+        'bbox_score': 1.0,
+        'bbox_id': 0,
+        'flip_pairs': [],
+        'inference_channel': np.arange(14),
+        'cube_size': np.array([300, 300, 300]),
+        'center_depth': 1.0,
+        'focal': np.array([cameras['fx'], cameras['fy']]),
+        'princpt': np.array([cameras['cx'], cameras['cy']]),
+        'image_file': '.png',
+    } for _ in range(N)]
+
+    mm_inputs = {
+        'imgs': imgs.requires_grad_(True),
+        'target': target,
+        'target_weight': target_weight,
+        'img_metas': img_metas
+    }
+    return mm_inputs
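
Note on the offset encoding/decoding: joint2offset writes, for every unmasked pixel, a unit vector pointing at the joint plus a closeness heatmap h = (kernel_size - dis) / kernel_size, and offset2joint_softmax re-aggregates them as a softmax(30 * h)-weighted average of (unit_vector * dis + pixel_coordinate), so the two static methods are approximately inverse. A minimal round-trip sketch, assuming this patch is applied and using a synthetic constant-depth crop (all values below are made up for illustration):

    # Round-trip sanity check for the AWR offset encoding/decoding (illustrative only).
    import torch

    from mmpose.models import AdaptiveWeightingRegression3DHead as AWRHead

    batch, joints, feat_size, kernel = 2, 14, 64, 0.4
    # Synthetic depth crop: every pixel "on the hand" (< 0.99) at normalized depth 0.5.
    img = torch.full((batch, 1, 128, 128), 0.5)
    # Random joints in normalized uvd space, with depth matching the synthetic crop.
    jt_uvd = torch.rand(batch, joints, 3) * 0.8 - 0.4
    jt_uvd[..., 2] = 0.5

    offset = AWRHead.joint2offset(jt_uvd, img, kernel, feat_size)   # (B, J*4, F, F)
    jt_rec = AWRHead.offset2joint_softmax(offset, img, kernel)      # (B, J, 3)

    # The softmax-weighted aggregation should approximately invert the encoding.
    assert torch.allclose(jt_rec, jt_uvd, atol=1e-2)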
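
Note on AWRSmoothL1Loss: it is a Huber-style loss with a fixed transition point of 0.01, using the quadratic branch 0.5 * z**2 for |z| < 0.01 and the linear branch 0.01 * (|z| - 0.005) otherwise; the two pieces meet continuously at |z| = 0.01 (both equal 5e-5). A small sketch of the unweighted case; the class name and import come from this patch, while the reference function is my own restatement of the formula:

    import torch

    from mmpose.models.losses import AWRSmoothL1Loss


    def awr_smooth_l1_reference(z, delta=0.01):
        """Element-wise piecewise loss, averaged over all elements."""
        quad = 0.5 * z.pow(2)                   # used where |z| <  delta
        lin = delta * (z.abs() - 0.5 * delta)   # used where |z| >= delta
        return torch.where(z.abs() < delta, quad, lin).mean()


    pred = torch.linspace(-0.05, 0.05, 84).view(2, 14, 3)   # (N, K, 3) residuals
    target = torch.zeros_like(pred)

    loss = AWRSmoothL1Loss(use_target_weight=False)(pred, target)
    assert torch.allclose(loss, awr_smooth_l1_reference(pred - target))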
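
Note on decode(): the normalized depth output is first mapped back to camera-space millimetres through the crop cube (z = z_norm * cube_size / 2 + center_depth) and the pixel coordinates are then back-projected with a pinhole model and identity extrinsics (per my reading, this is what SimpleCamera.pixel_to_camera does with R = I, T = 0). A hypothetical single-keypoint walk-through with invented numbers; only the intrinsics match the values used in the tests:

    import numpy as np

    # Hypothetical predicted keypoint in image space (values invented for illustration).
    u, v = 70.0, 55.0            # pixel coordinates of the decoded keypoint
    z_norm = 0.25                # network depth output, normalized to [-1, 1]
    cube_size, center_depth = 300.0, 750.0             # mm, as stored in img_metas
    fx, fy, cx, cy = 588.03, 587.07, 320.0, 240.0      # NYU camera intrinsics

    d = z_norm * cube_size / 2 + center_depth          # 787.5 mm, as in decode()
    x = (u - cx) * d / fx                              # pinhole back-projection,
    y = (v - cy) * d / fy                              # i.e. pixel_to_camera with R=I, T=0
    print(np.round([x, y, d], 1))                      # approx. [-334.8, -248.2, 787.5]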